diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 22c348acaf341..e8a2dd4879f20 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -244,6 +244,9 @@ def _outer_indexer(self, left, right): str = CachedAccessor("str", StringMethods) + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None, tupleize_cols=True, **kwargs): @@ -518,6 +521,19 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + # Index Internals Methods + + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. + """ + return {k: getattr(self, k, None) for k in self._attributes} + _index_shared_docs['_shallow_copy'] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking @@ -608,42 +624,6 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def _sort_levels_monotonic(self): - """ - Compat with MultiIndex. - """ - return self - - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level - - Parameters - ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level - - Returns - ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level - """ - - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self - else: - grouper = self.map(mapper) - - return grouper, None, None - def is_(self, other): """ More flexible, faster check like ``is`` but that works through views. @@ -671,6 +651,17 @@ def _reset_identity(self): self._id = _Identity() return self + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + + # -------------------------------------------------------------------- + # Array-Like Methods + # ndarray compat def __len__(self): """ @@ -709,97 +700,129 @@ def dtype_str(self): """ return str(self.dtype) - @property - def values(self): - """ - Return the underlying data as an ndarray. + def ravel(self, order='C'): """ - return self._data.view(np.ndarray) + Return an ndarray of the flattened values of the underlying data. - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index, np.ndarray] - # TODO(EA): remove index types as they become extension arrays + See Also + -------- + numpy.ndarray.ravel """ - The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. + return self._ndarray_values.ravel(order=order) - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. + def view(self, cls=None): - It may differ from the public '.values' method. 
+ # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. - See Also - -------- - values - _ndarray_values - """ - return self.values + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. - def get_values(self): + .. versionadded:: 0.19.0 """ - Return `Index` data as an `numpy.ndarray`. - Returns - ------- - numpy.ndarray - A one-dimensional numpy array of the `Index` values. + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self - See Also - -------- - Index.values : The attribute that get_values wraps. + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) - Examples - -------- - Getting the `Index` values of a `DataFrame`: + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) - >>> df - A B C - a 1 2 3 - b 4 5 6 - c 7 8 9 - >>> df.index.get_values() - array(['a', 'b', 'c'], dtype=object) + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - Standalone `Index` values: + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. - >>> idx = pd.Index(['1', '2', '3']) - >>> idx.get_values() - array(['1', '2', '3'], dtype=object) + For internal compatibility with numpy arrays. - `MultiIndex` arrays also have only one dimension: + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError - >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], - ... 
names=('number', 'letter')) - >>> midx.get_values() - array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx.get_values().ndim - 1 + See Also + -------- + numpy.ndarray.take """ - return self.values - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken - # ops compat def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Index. @@ -838,185 +861,28 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) - _index_shared_docs['where'] = """ - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. + # -------------------------------------------------------------------- + # Copying Methods - .. versionadded:: 0.19.0 + _index_shared_docs['copy'] = """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. Parameters ---------- - cond : boolean array-like with the same length as self - other : scalar, or array-like - """ + name : string, optional + deep : boolean, default False + dtype : numpy dtype or pandas type - @Appender(_index_shared_docs['where']) - def where(self, cond, other=None): - if other is None: - other = self._na_value + Returns + ------- + copy : Index - dtype = self.dtype - values = self.values - - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None - - values = np.where(cond, values, other) - - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return self._shallow_copy_with_infer(values, dtype=dtype) - - def ravel(self, order='C'): - """ - Return an ndarray of the flattened values of the underlying data. - - See Also - -------- - numpy.ndarray.ravel - """ - return self._ndarray_values.ravel(order=order) - - # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. 
- - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - - def _get_attributes_dict(self): - """ - Return an attributes dict for my class. - """ - return {k: getattr(self, k, None) for k in self._attributes} - - def view(self, cls=None): - - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result - - def _coerce_scalar_to_index(self, item): - """ - We need to coerce a scalar to a compat for our index type. - - Parameters - ---------- - item : scalar item to coerce - """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) - - _index_shared_docs['copy'] = """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. - - Parameters - ---------- - name : string, optional - deep : boolean, default False - dtype : numpy dtype or pandas type - - Returns - ------- - copy : Index - - Notes - ----- - In most cases, there should be no functional difference from using - ``deep``, but if ``deep`` is passed it will attempt to deepcopy. - """ + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. 
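+
+        Examples
+        --------
+        A minimal illustration (a plain object-dtype Index is assumed):
+
+        >>> idx = pd.Index(['a', 'b', 'c'])
+        >>> new_idx = idx.copy(name='letters')
+        >>> new_idx
+        Index(['a', 'b', 'c'], dtype='object', name='letters')
+        >>> new_idx is idx
+        False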
+ """ @Appender(_index_shared_docs['copy']) def copy(self, name=None, deep=False, dtype=None, **kwargs): @@ -1047,24 +913,8 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _validate_names(self, name=None, names=None, deep=False): - """ - Handles the quirks of having a singular 'name' parameter for general - Index and plural 'names' parameter for MultiIndex. - """ - from copy import deepcopy - if names is not None and name is not None: - raise TypeError("Can only provide one of `names` and `name`") - elif names is None and name is None: - return deepcopy(self.names) if deep else self.names - elif names is not None: - if not is_list_like(names): - raise TypeError("Must pass list-like as `names`.") - return names - else: - if not is_list_like(name): - return [name] - return name + # -------------------------------------------------------------------- + # Rendering Methods def __unicode__(self): """ @@ -1125,64 +975,192 @@ def _format_attrs(self): """ return format_object_attrs(self) - def to_flat_index(self): - """ - Identity method. + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values - .. versionadded:: 0.24.0 + def format(self, name=False, formatter=None, **kwargs): + """ + Render a string representation of the Index. + """ + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') - This is implemented for compatability with subclass implementations - when chaining. + if formatter is not None: + return header + list(self.map(formatter)) - Returns - ------- - pd.Index - Caller. + return self._format_with_header(header, **kwargs) - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index. + from pandas.io.formats.format import format_array - Parameters - ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + if is_categorical_dtype(values.dtype): + values = np.array(values) - Returns - ------- - Series : dtype will be based on the type of the Index values. - """ + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) - from pandas import Series + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() - return Series(self.values.copy(), index=index, name=name) + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result - def to_frame(self, index=True, name=None): + def to_native_types(self, slicer=None, **kwargs): """ - Create a DataFrame with a column containing the Index. - - .. versionadded:: 0.24.0 + Format specified values of `self` and return them. Parameters ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index. 
- + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: + + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values + """ + + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + + values[mask] = na_rep + return values + + def _summary(self, name=None): + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if (hasattr(head, 'format') and + not isinstance(head, compat.string_types)): + head = head.format() + tail = self[-1] + if (hasattr(tail, 'format') and + not isinstance(tail, compat.string_types)): + tail = tail.format() + index_summary = ', %s to %s' % (pprint_thing(head), + pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + + def summary(self, name=None): + """ + Return a summarized representation. + + .. deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) + + # -------------------------------------------------------------------- + # Conversion Methods + + def to_flat_index(self): + """ + Identity method. + + .. versionadded:: 0.24.0 + + This is implemented for compatability with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ + + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index. + name : object, default None The passed name should substitute for the index name (if it has one). @@ -1233,83 +1211,27 @@ def to_frame(self, index=True, name=None): result.index = self return result - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. 
The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - .. versionadded:: 0.19.0 - """ - - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self - - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) - - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + # -------------------------------------------------------------------- + # Name-Centric Methods - def _to_safe_for_reshape(self): + def _validate_names(self, name=None, names=None, deep=False): """ - Convert to object if we are a categorical. + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. """ - return self - - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True - - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name + from copy import deepcopy + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names else: - result_name = get_op_result_name(self, other) - return other, result_name - - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. - """ - return value - - def _assert_can_do_op(self, value): - """ - Check value is valid for scalar op. - """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - - @property - def nlevels(self): - return 1 + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name, )) @@ -1468,60 +1390,193 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + # -------------------------------------------------------------------- + # Level-Centric Methods + @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False + def nlevels(self): + return 1 - def _summary(self, name=None): + def _sort_levels_monotonic(self): """ - Return a summarized representation. - - Parameters - ---------- - name : str - name to use in the summary representation + Compat with MultiIndex. 
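+        For a flat Index this is a no-op that returns ``self`` unchanged.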
+ """ + return self - Returns - ------- - String with a summarized representation of the index + def _validate_index_level(self, level): """ - if len(self) > 0: - head = self[0] - if (hasattr(head, 'format') and - not isinstance(head, compat.string_types)): - head = head.format() - tail = self[-1] - if (hasattr(tail, 'format') and - not isinstance(tail, compat.string_types)): - tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) - else: - index_summary = '' + Validate index level. - if name is None: - name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. - def summary(self, name=None): """ - Return a summarized representation. + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) - .. deprecated:: 0.23.0 + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 + + def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) - return self._summary(name) + For internal compatibility with with the Index API. - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values + Sort the Index. This is for compat with MultiIndex - _na_value = np.nan - """The expected NA value to use with this index.""" + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + sorted_index : Index + """ + return self.sort_values(return_indexer=True, ascending=ascending) + + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. 
+ + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level + + Parameters + ---------- + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level + """ + + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) + + return grouper, None, None + + # -------------------------------------------------------------------- + # Introspection Methods - # introspection @property def is_monotonic(self): """ @@ -1671,234 +1726,385 @@ def is_mixed(self): def holds_integer(self): return self.inferred_type in ['integer', 'mixed-integer'] - _index_shared_docs['_convert_scalar_indexer'] = """ - Convert a scalar indexer. + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self) - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) - @Appender(_index_shared_docs['_convert_scalar_indexer']) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + # -------------------------------------------------------------------- + # Pickle Methods - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None - if len(self) and not isinstance(self, ABCMultiIndex,): + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ - # we can raise here if we are definitive that this - # is positional indexing (eg. 
.ix on with a float) - # or label indexing if we are using a type able - # to be represented in the index + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) - if kind in ['getitem', 'ix'] and is_float(key): - if not self.is_floating(): - return self._invalid_indexer('label', key) + elif isinstance(state, tuple): - elif kind in ['loc'] and is_float(key): + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) - elif kind in ['loc'] and is_integer(key): - if not self.holds_integer(): - return self._invalid_indexer('label', key) + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") - return key + _unpickle_compat = __setstate__ - _index_shared_docs['_convert_slice_indexer'] = """ - Convert a slice indexer. + # -------------------------------------------------------------------- + # Null Handling Methods - By definition, these are labels unless 'iloc' is passed in. - Floats are not allowed as the start, step, or stop of the slice. + _na_value = np.nan + """The expected NA value to use with this index.""" - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. + """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values - @Appender(_index_shared_docs['_convert_slice_indexer']) - def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False - # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + def isna(self): + """ + Detect missing values. - # potentially cast the bounds to integers - start, stop, step = key.start, key.stop, key.step + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). - # figure out if this is a positional indexer - def is_int(v): - return v is None or is_integer(v) + .. 
versionadded:: 0.20.0 - is_null_slicer = start is None and stop is None - is_index_slice = is_int(start) and is_int(stop) - is_positional = is_index_slice and not self.is_integer() + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA - if kind == 'getitem': - """ - called from the getitem slicers, validate that we are in fact - integers - """ - if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. - # convert the slice to an indexer here + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. - # if we are mixed and have integers - try: - if is_positional and self.is_mixed(): - # Validate start & stop - if start is not None: - self.get_loc(start) - if stop is not None: - self.get_loc(stop) - is_positional = False - except KeyError: - if self.inferred_type == 'mixed-integer-float': - raise + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) - if is_null_slicer: - indexer = key - elif is_positional: - indexer = key - else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + Empty strings are not considered NA values. None is considered an NA + value. - return indexer + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) - def _convert_listlike_indexer(self, keyarr, kind=None): - """ - Parameters - ---------- - keyarr : list-like - Indexer to convert. + For datetimes, `NaT` (Not a Time) is considered as an NA value. - Returns - ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) """ - if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) - else: - keyarr = self._convert_arr_indexer(keyarr) + return self._isnan + isnull = isna - indexer = self._convert_list_indexer(keyarr, kind=kind) - return indexer, keyarr + def notna(self): + """ + Detect existing (non-missing) values. - _index_shared_docs['_convert_arr_indexer'] = """ - Convert an array-like indexer to the appropriate dtype. + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. - Parameters - ---------- - keyarr : array-like - Indexer to convert. + .. versionadded:: 0.20.0 Returns ------- - converted_keyarr : array-like - """ + numpy.ndarray + Boolean array to indicate which entries are not NA. 
- @Appender(_index_shared_docs['_convert_arr_indexer']) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) - return keyarr + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. - _index_shared_docs['_convert_index_indexer'] = """ - Convert an Index indexer to the appropriate dtype. + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + notnull = notna + + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) Returns ------- - converted_keyarr : Index (or sub-class) - """ + filled : %(klass)s + """ - @Appender(_index_shared_docs['_convert_index_indexer']) - def _convert_index_indexer(self, keyarr): - return keyarr + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() - _index_shared_docs['_convert_list_indexer'] = """ - Convert a list-like indexer to the appropriate dtype. + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. - kind : iloc, ix, loc, optional + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. 
Returns ------- - positional indexer or None - """ + valid : Index + """ - @Appender(_index_shared_docs['_convert_list_indexer']) - def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) - if self.inferred_type == 'mixed-integer': - indexer = self.get_indexer(keyarr) - if (indexer >= 0).all(): - return indexer - # missing values are flagged as -1 by get_indexer and negative - # indices are already converted to positive indices in the - # above if-statement, so the negative flags are changed to - # values outside the range of indices so as to trigger an - # IndexError in maybe_convert_indices - indexer[indexer < 0] = len(self) - from pandas.core.indexing import maybe_convert_indices - return maybe_convert_indices(indexer, len(self)) + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() - elif not self.inferred_type == 'integer': - keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) - return keyarr + # -------------------------------------------------------------------- + # Uniqueness Methods - return None + _index_shared_docs['index_unique'] = ( + """ + Return unique values in the index. Uniques are returned in order + of appearance, this does NOT sort. - def _invalid_indexer(self, form, key): + Parameters + ---------- + level : int or str, optional, default None + Only return values from specified level (for MultiIndex) + + .. versionadded:: 0.23.0 + + Returns + ------- + Index without duplicates + + See Also + -------- + unique + Series.unique + """) + + @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = super(Index, self).unique() + return self._shallow_copy(result) + + def drop_duplicates(self, keep='first'): """ - Consistent invalid indexer message. + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. 
+ + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + return super(Index, self).drop_duplicates(keep=keep) + + def duplicated(self, keep='first'): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + + Returns + ------- + numpy.ndarray + + See Also + -------- + pandas.Series.duplicated : Equivalent method on pandas.Series. + pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. + pandas.Index.drop_duplicates : Remove duplicate values from Index. + """ + return super(Index, self).duplicated(keep=keep) def get_duplicates(self): """ @@ -1959,97 +2165,65 @@ def get_duplicates(self): return self[self.duplicated()].unique() - def _cleanup(self): - self._engine.clear_mapping() - - @cache_readonly - def _constructor(self): - return type(self) + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. - def _validate_index_level(self, level): + Returns + ------- + uniques : index """ - Validate index level. + if self.is_unique and not dropna: + return self - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. 
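+        # start from the raw values; deduplicate only when duplicates exist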
+ values = self.values - """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) + if not self.is_unique: + values = self.unique() - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass - @cache_readonly - def inferred_type(self): - """ - Return a string of the type inferred from the values. - """ - return lib.infer_dtype(self) + return self._shallow_copy(values) - def _is_memory_usage_qualified(self): - """ - Return a boolean if we need a qualified .info display. - """ - return self.is_object() + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods - def is_type_compatible(self, kind): - return kind == self.inferred_type + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + return Index(np.array(self) + other) - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) + def __radd__(self, other): + return Index(other + np.array(self)) - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None + def __iadd__(self, other): + # alias for __add__ + return self + other - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ + def __sub__(self, other): + return Index(np.array(self) - other) - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) + def __rsub__(self, other): + return Index(other - np.array(self)) - elif isinstance(state, tuple): + def __and__(self, other): + return self.intersection(other) - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] + def __or__(self, other): + return self.union(other) - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ + def __xor__(self, other): + return self.symmetric_difference(other) def __nonzero__(self): raise ValueError("The truth value of a {0} is ambiguous. " @@ -2058,2236 +2232,2302 @@ def __nonzero__(self): __bool__ = __nonzero__ - _index_shared_docs['__contains__'] = """ - Return a boolean if this key is IN the index. - - Parameters - ---------- - key : object + # -------------------------------------------------------------------- + # Set Operation Methods - Returns - ------- - boolean + def _get_reconciled_name_object(self, other): """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. 
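+        The reconciled name comes from ``get_op_result_name``, which keeps
+        a name only when both operands agree on it.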
+ """ + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) + return self - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - return key in self._engine - except (OverflowError, TypeError, ValueError): - return False - - _index_shared_docs['contains'] = """ - Return a boolean if this key is IN the index. + def union(self, other): + """ + Form the union of two Index objects and sorts if possible. Parameters ---------- - key : object + other : Index or array-like Returns ------- - boolean + union : Index + + Examples + -------- + + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') """ + self._assert_can_do_setop(other) + other = ensure_index(other) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) - def contains(self, key): - hash(key) - try: - return key in self._engine - except (TypeError, ValueError): - return False + if len(other) == 0 or self.equals(other): + return self._get_reconciled_name_object(other) - def __hash__(self): - raise TypeError("unhashable type: %r" % type(self).__name__) + if len(self) == 0: + return other._get_reconciled_name_object(self) - def __setitem__(self, key, value): - raise TypeError("Index does not support mutable operations") + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.union(other) - def __getitem__(self, key): - """ - Override numpy.ndarray's __getitem__ method to work as desired. + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - This function adds lists and Series as valid boolean indexers - (ndarrays only supports ndarray with dtype=bool). + if self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) - If resulting ndim != 1, plain ndarray is returned instead of - corresponding `Index` subclass. + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer, = (indexer == -1).nonzero() - """ - # There's no custom logic to be implemented in __getslice__, so it's - # not overloaded intentionally. 
- getitem = self._data.__getitem__ - promote = self._shallow_copy + if len(indexer) > 0: + other_diff = algos.take_nd(rvals, indexer, + allow_fill=False) + result = _concat._concat_compat((lvals, other_diff)) - if is_scalar(key): - key = com.cast_scalar_indexer(key) - return getitem(key) + try: + lvals[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() - if isinstance(key, slice): - # This case is separated from the conditional above to avoid - # pessimization of basic indexing. - return promote(getitem(key)) + else: + result = lvals - if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) - key = com.values_from_object(key) - result = getitem(key) - if not is_scalar(result): - return promote(result) - else: - return result + # for subclasses + return self._wrap_setop_result(other, result) - def _can_hold_identifiers_and_holds_name(self, name): - """ - Faster check for ``name in self`` when we know `name` is a Python - identifier (e.g. in NDFrame.__getattr__, which hits this to support - . key lookup). For indexes that can't hold identifiers (everything - but object & categorical) we just return False. + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) - https://github.com/pandas-dev/pandas/issues/19764 + def intersection(self, other): """ - if self.is_object() or self.is_categorical(): - return name in self - return False + Form the intersection of two Index objects. - def append(self, other): - """ - Append a collection of Index options together. + This returns a new Index with elements common to the index and `other`, + preserving the order of the calling index. 
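+
+        If the two objects have different dtypes, both are cast to object
+        dtype before the intersection is computed.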
Parameters ---------- - other : Index or list/tuple of indices + other : Index or array-like Returns ------- - appended : Index - """ + intersection : Index - to_concat = [self] + Examples + -------- - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - for obj in to_concat: - if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + if self.equals(other): + return self._get_reconciled_name_object(other) - names = {obj.name for obj in to_concat} - name = None if len(names) > 1 else self.name + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) - return self._concat(to_concat, name) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - def _concat(self, to_concat, name): + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(lvals, rvals)[0] + return self._wrap_setop_result(other, result) + except TypeError: + pass - typs = _concat.get_dtype_kinds(to_concat) + try: + indexer = Index(rvals).get_indexer(lvals) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except Exception: + # duplicates + indexer = algos.unique1d( + Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = indexer[indexer != -1] - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) + taken = other.take(indexer) + if self.name != other.name: + taken.name = None + return taken - def _concat_same_dtype(self, to_concat, name): - """ - Concatenate to_concat which has the same class. + def difference(self, other, sort=True): """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - _index_shared_docs['take'] = """ - Return a new %(klass)s of the values selected by the indices. + Return a new Index with elements from the index that are not in + `other`. - For internal compatibility with numpy arrays. + This is the set difference of two Index objects. Parameters ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + other : Index or array-like + sort : bool, default True + Sort the resulting index if possible - See Also + .. 
versionadded:: 0.24.0 + + Returns + ------- + difference : Index + + Examples -------- - numpy.ndarray.take + + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') """ + self._assert_can_do_setop(other) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) + if self.equals(other): + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): - """ - Internal method to handle NA filling of take. - """ - indices = ensure_platform_int(indices) + other, result_name = self._convert_can_do_setop(other) - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken + this = self._get_unique_index() - @cache_readonly - def _isnan(self): - """ - Return if each value is NaN. - """ - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - @cache_readonly - def hasnans(self): - """ - Return if I have any nans; enables various perf speedups. - """ - if self._can_hold_na: - return bool(self._isnan.any()) - else: - return False + return this._shallow_copy(the_diff, name=result_name, freq=None) - def isna(self): + def symmetric_difference(self, other, result_name=None): """ - Detect missing values. + Compute the symmetric difference of two Index objects. - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + It's sorted if sorting is possible. - .. 
versionadded:: 0.20.0 + Parameters + ---------- + other : Index or array-like + result_name : str Returns ------- - numpy.ndarray - A boolean array of whether my values are NA + symmetric_difference : Index - See Also - -------- - pandas.Index.notna : Boolean inverse of isna. - pandas.Index.dropna : Omit entries with missing values. - pandas.isna : Top-level isna. - Series.isna : Detect missing values in Series object. + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. Examples -------- - Show which entries in a pandas.Index are NA. The result is an - array. + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Int64Index([1, 5], dtype='int64') - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) + You can also use the ``^`` operator: - Empty strings are not considered NA values. None is considered an NA - value. + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) - For datetimes, `NaT` (Not a Time) is considered as an NA value. + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) - >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) - def notna(self): - """ - Detect existing (non-missing) values. + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. + attribs = self._get_attributes_dict() + attribs['name'] = result_name + if 'freq' in attribs: + attribs['freq'] = None + return self._shallow_copy_with_infer(the_diff, **attribs) - .. versionadded:: 0.20.0 + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. 
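# A minimal sketch, not part of the patch itself, of the set-op semantics
# documented above (difference, symmetric_difference and the new `sort`
# flag); public API only, outputs assume a pandas version around 0.24.
import pandas as pd

idx1 = pd.Index([2, 1, 3, 4])
idx2 = pd.Index([3, 4, 5, 6])

print(idx1.difference(idx2))              # Int64Index([1, 2]) -- sorted
print(idx1.difference(idx2, sort=False))  # Int64Index([2, 1]) -- caller order
print(idx1.symmetric_difference(idx2))    # Int64Index([1, 2, 5, 6])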
+    def _convert_can_do_setop(self, other):
+        if not isinstance(other, Index):
+            other = Index(other, name=self.name)
+            result_name = self.name
+        else:
+            result_name = get_op_result_name(self, other)
+        return other, result_name

-        See Also
-        --------
-        Index.notnull : Alias of notna.
-        Index.isna: Inverse of notna.
-        pandas.notna : Top-level notna.
+    # --------------------------------------------------------------------
+    # Indexing Methods

-        Examples
-        --------
-        Show which entries in an Index are not NA. The result is an
-        array.
+    _index_shared_docs['get_loc'] = """
+        Get integer location, slice or boolean mask for requested label.

-        >>> idx = pd.Index([5.2, 6.0, np.NaN])
-        >>> idx
-        Float64Index([5.2, 6.0, nan], dtype='float64')
-        >>> idx.notna()
-        array([ True,  True, False])
+        Parameters
+        ----------
+        key : label
+        method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
+            * default: exact matches only.
+            * pad / ffill: find the PREVIOUS index value if no exact match.
+            * backfill / bfill: use NEXT index value if no exact match.
+            * nearest: use the NEAREST index value if no exact match. Tied
+              distances are broken by preferring the larger index value.
+        tolerance : optional
+            Maximum distance from index value for inexact matches. The value
+            of the index at the matching location must satisfy the equation
+            ``abs(index[loc] - key) <= tolerance``.

-        Empty strings are not considered NA values. None is considered a NA
-        value.
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must
+            be the same size as the index and its dtype must exactly match
+            the index's type.

-        >>> idx = pd.Index(['black', '', 'red', None])
-        >>> idx
-        Index(['black', '', 'red', None], dtype='object')
-        >>> idx.notna()
-        array([ True,  True,  True, False])
-        """
-        return ~self.isna()
-    notnull = notna
+            .. versionadded:: 0.21.0 (list-like tolerance)

-    def putmask(self, mask, value):
-        """
-        Return a new Index of the values set with the mask.
+        Returns
+        -------
+        loc : int if unique index, slice if monotonic index, else mask

-        See Also
-        --------
-        numpy.ndarray.putmask
-        """
-        values = self.values.copy()
-        try:
-            np.putmask(values, mask, self._convert_for_op(value))
-            return self._shallow_copy(values)
-        except (ValueError, TypeError) as err:
-            if is_object_dtype(self):
-                raise err
+        Examples
+        --------
+        >>> unique_index = pd.Index(list('abc'))
+        >>> unique_index.get_loc('b')
+        1

-            # coerces to object
-            return self.astype(object).putmask(mask, value)
+        >>> monotonic_index = pd.Index(list('abbc'))
+        >>> monotonic_index.get_loc('b')
+        slice(1, 3, None)

-    def format(self, name=False, formatter=None, **kwargs):
-        """
-        Render a string representation of the Index.
+        >>> non_monotonic_index = pd.Index(list('abcb'))
+        >>> non_monotonic_index.get_loc('b')
+        array([False,  True, False,  True], dtype=bool)
         """
-        header = []
-        if name:
-            header.append(pprint_thing(self.name,
-                                       escape_chars=('\t', '\r', '\n')) if
-                          self.name is not None else '')
-        if formatter is not None:
-            return header + list(self.map(formatter))
+    @Appender(_index_shared_docs['get_loc'])
+    def get_loc(self, key, method=None, tolerance=None):
+        if method is None:
+            if tolerance is not None:
+                raise ValueError('tolerance argument only valid if using pad, '
+                                 'backfill or nearest lookups')
+            try:
+                return self._engine.get_loc(key)
+            except KeyError:
+                return self._engine.get_loc(self._maybe_cast_indexer(key))
+        indexer = self.get_indexer([key], method=method, tolerance=tolerance)
+        if indexer.ndim > 1 or indexer.size > 1:
+            raise TypeError('get_loc requires scalar valued input')
+        loc = indexer.item()
+        if loc == -1:
+            raise KeyError(key)
+        return loc

-        return self._format_with_header(header, **kwargs)
+    _index_shared_docs['get_indexer'] = """
+        Compute indexer and mask for new index given the current index. The
+        indexer should then be used as an input to ndarray.take to align the
+        current data to the new index.

-    def _format_with_header(self, header, na_rep='NaN', **kwargs):
-        values = self.values
+        Parameters
+        ----------
+        target : %(target_klass)s
+        method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
+            * default: exact matches only.
+            * pad / ffill: find the PREVIOUS index value if no exact match.
+            * backfill / bfill: use NEXT index value if no exact match.
+            * nearest: use the NEAREST index value if no exact match. Tied
+              distances are broken by preferring the larger index value.
+        limit : int, optional
+            Maximum number of consecutive labels in ``target`` to match for
+            inexact matches.
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations must
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.

-        from pandas.io.formats.format import format_array
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must
+            be the same size as the index and its dtype must exactly match
+            the index's type.

-        if is_categorical_dtype(values.dtype):
-            values = np.array(values)
+            .. versionadded:: 0.21.0 (list-like tolerance)

-        elif is_object_dtype(values.dtype):
-            values = lib.maybe_convert_objects(values, safe=1)
+        Returns
+        -------
+        indexer : ndarray of int
+            Integers from 0 to n - 1 indicating that the index at these
+            positions matches the corresponding target values. Missing values
+            in the target are marked by -1.

-        if is_object_dtype(values.dtype):
-            result = [pprint_thing(x, escape_chars=('\t', '\r', '\n'))
-                      for x in values]
+        Examples
+        --------
+        >>> index = pd.Index(['c', 'a', 'b'])
+        >>> index.get_indexer(['a', 'b', 'x'])
+        array([ 1,  2, -1])

-            # could have nans
-            mask = isna(values)
-            if mask.any():
-                result = np.array(result)
-                result[mask] = na_rep
-                result = result.tolist()
+        Notice that the return value is an array of locations in ``index``
+        and ``x`` is marked by -1, as it is not in ``index``.
+ """ - else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) - def to_native_types(self, slicer=None, **kwargs): - """ - Format specified values of `self` and return them. + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. + # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return ensure_platform_int(np.repeat(-1, target.size)) - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit, + tolerance=tolerance) - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ + if not is_dtype_equal(self.dtype, target.dtype): + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit, + tolerance=tolerance) - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) + if not self.is_unique: + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') - def _format_native_types(self, na_rep='', quoting=None, **kwargs): - """ - Actually format specific types of the index. - """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) + if method == 'pad' or method == 'backfill': + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == 'nearest': + indexer = self._get_nearest_indexer(target, limit, tolerance) else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values + if tolerance is not None: + raise ValueError('tolerance argument only valid if doing pad, ' + 'backfill or nearest reindexing') + if limit is not None: + raise ValueError('limit argument only valid if doing pad, ' + 'backfill or nearest reindexing') - def equals(self, other): - """ - Determines if two Index objects contain the same elements. 
- """ - if self.is_(other): - return True + indexer = self._engine.get_indexer(target._ndarray_values) - if not isinstance(other, Index): - return False + return ensure_platform_int(indexer) - if is_object_dtype(self) and not is_object_dtype(other): - # if other is not object, use other's logic for coercion - return other.equals(self) + def _convert_tolerance(self, tolerance, target): + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError('list-like tolerance size must match ' + 'target index size') + return tolerance - try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) - except Exception: - return False + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = (self._engine.get_pad_indexer if method == 'pad' else + self._engine.get_backfill_indexer) + indexer = method(target._ndarray_values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, + limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, + tolerance) + return indexer - def identical(self, other): - """ - Similar to equals, but check that other comparable attributes are - also equal. + def _get_fill_indexer_searchsorted(self, target, method, limit=None): """ - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) - - def asof(self, label): + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. """ - Return the label from the index, or, if not present, the previous one. - - Assuming that the index is sorted, return the passed index label if it - is in the index, or return the previous index label if the passed one - is not in the index. + if limit is not None: + raise ValueError('limit argument for %r method only well-defined ' + 'if index and target are monotonic' % method) - Parameters - ---------- - label : object - The label up to which the method returns the latest index label. + side = 'left' if method == 'pad' else 'right' - Returns - ------- - object - The passed label if it is in the index. The previous label if the - passed label is not in the sorted index or `NaN` if there is no - such label. - - See Also - -------- - Series.asof : Return the latest value in a Series up to the - passed index. - merge_asof : Perform an asof merge (similar to left join but it - matches on nearest key rather than equal key). - Index.get_loc : An `asof` is a thin wrapper around `get_loc` - with method='pad'. - - Examples - -------- - `Index.asof` returns the latest index label up to the passed label. + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = (indexer == -1) + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], + side) + if side == 'left': + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. 
+ indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') - '2013-12-31' + def _get_nearest_indexer(self, target, limit, tolerance): + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). + """ + left_indexer = self.get_indexer(target, 'pad', limit=limit) + right_indexer = self.get_indexer(target, 'backfill', limit=limit) - If the label is in the index, the method returns the passed label. + target = np.asarray(target) + left_distances = abs(self.values[left_indexer] - target) + right_distances = abs(self.values[right_indexer] - target) - >>> idx.asof('2014-01-02') - '2014-01-02' + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where(op(left_distances, right_distances) | + (right_indexer == -1), left_indexer, right_indexer) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, + tolerance) + return indexer - If all of the labels in the index are later than the passed label, - NaN is returned. + def _filter_indexer_tolerance(self, target, indexer, tolerance): + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) + return indexer - >>> idx.asof('1999-01-02') - nan + # -------------------------------------------------------------------- + # Indexer Conversion Methods - If the index is not sorted, an error is raised. + _index_shared_docs['_convert_scalar_indexer'] = """ + Convert a scalar indexer. - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') - Traceback (most recent call last): - ValueError: index must be monotonic increasing or decreasing - """ - try: - loc = self.get_loc(label, method='pad') - except KeyError: - return self._na_value - else: - if isinstance(loc, slice): - loc = loc.indices(len(self))[-1] - return self[loc] + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - def asof_locs(self, where, mask): - """ - Finds the locations (indices) of the labels from the index for - every entry in the `where` argument. + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - As in the `asof` function, if the label (a particular entry in - `where`) is not in the index, the latest index label upto the - passed label is chosen and its index returned. + if kind == 'iloc': + return self._validate_indexer('positional', key, kind) - If all of the labels in the index are later than a label in `where`, - -1 is returned. + if len(self) and not isinstance(self, ABCMultiIndex,): - `mask` is used to ignore NA values in the index during calculation. + # we can raise here if we are definitive that this + # is positional indexing (eg. .ix on with a float) + # or label indexing if we are using a type able + # to be represented in the index - Parameters - ---------- - where : Index - An Index consisting of an array of timestamps. 
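# A NumPy sketch, not part of the patch itself, of the searchsorted fallback
# above for method='pad': exact matches are resolved first, then the misses
# are searchsorted with side='left' and shifted by -1 (simplified here to a
# sorted unique index with no tolerance handling).
import numpy as np

index = np.array([10, 20, 30])
target = np.array([5, 25])

indexer = np.searchsorted(index, target, side='left') - 1
print(indexer)  # [-1  1] -- 5 has nothing to pad from; 25 pads back to 20
# The -1 produced for 5 doubles as the missing-value sentinel, exactly as
# the comment in the implementation points out.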
- mask : array-like - Array of booleans denoting where values in the original - data are not NA. + if kind in ['getitem', 'ix'] and is_float(key): + if not self.is_floating(): + return self._invalid_indexer('label', key) - Returns - ------- - numpy.ndarray - An array of locations (indices) of the labels from the Index - which correspond to the return values of the `asof` function - for every element in `where`. - """ - locs = self.values[mask].searchsorted(where.values, side='right') - locs = np.where(locs > 0, locs - 1, 0) + elif kind in ['loc'] and is_float(key): - result = np.arange(len(self))[mask].take(locs) + # we want to raise KeyError on string/mixed here + # technically we *could* raise a TypeError + # on anything but mixed though + if self.inferred_type not in ['floating', + 'mixed-integer-float', + 'string', + 'unicode', + 'mixed']: + return self._invalid_indexer('label', key) - first = mask.argmax() - result[(locs == 0) & (where.values < self.values[first])] = -1 + elif kind in ['loc'] and is_integer(key): + if not self.holds_integer(): + return self._invalid_indexer('label', key) - return result + return key - def sort_values(self, return_indexer=False, ascending=True): - """ - Return a sorted copy of the index. + _index_shared_docs['_convert_slice_indexer'] = """ + Convert a slice indexer. - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. Parameters ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - Returns - ------- - sorted_index : pandas.Index - Sorted copy of the index. - indexer : numpy.ndarray, optional - The indices that the index itself was sorted by. + @Appender(_index_shared_docs['_convert_slice_indexer']) + def _convert_slice_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - See Also - -------- - pandas.Series.sort_values : Sort values of a Series. - pandas.DataFrame.sort_values : Sort values in a DataFrame. + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key - Examples - -------- - >>> idx = pd.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + # validate iloc + if kind == 'iloc': + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - Sort values in ascending order (default behavior). + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + # figure out if this is a positional indexer + def is_int(v): + return v is None or is_integer(v) - Sort values in descending order, and also get the indices `idx` was - sorted by. 
+ is_null_slicer = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not self.is_integer() - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) - """ - _as = self.argsort() - if not ascending: - _as = _as[::-1] + if kind == 'getitem': + """ + called from the getitem slicers, validate that we are in fact + integers + """ + if self.is_integer() or is_index_slice: + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - sorted_index = self.take(_as) + # convert the slice to an indexer here - if return_indexer: - return sorted_index, _as + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type == 'mixed-integer-float': + raise + + if is_null_slicer: + indexer = key + elif is_positional: + indexer = key else: - return sorted_index + try: + indexer = self.slice_indexer(start, stop, step, kind=kind) + except Exception: + if is_index_slice: + if self.is_integer(): + raise + else: + indexer = key + else: + raise - def sort(self, *args, **kwargs): - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + return indexer - def sortlevel(self, level=None, ascending=True, sort_remaining=None): + def _convert_listlike_indexer(self, keyarr, kind=None): """ - For internal compatibility with with the Index API. - - Sort the Index. This is for compat with MultiIndex - Parameters ---------- - ascending : boolean, default True - False to sort in descending order - - level, sort_remaining are compat parameters + keyarr : list-like + Indexer to convert. Returns ------- - sorted_index : Index + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys """ - return self.sort_values(return_indexer=True, ascending=ascending) + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) + else: + keyarr = self._convert_arr_indexer(keyarr) - def shift(self, periods=1, freq=None): - """ - Shift index by desired number of time frequency increments. + indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr - This method is for shifting the values of datetime-like indexes - by a specified time increment a given number of times. + _index_shared_docs['_convert_arr_indexer'] = """ + Convert an array-like indexer to the appropriate dtype. Parameters ---------- - periods : int, default 1 - Number of periods (or increments) to shift by, - can be positive or negative. - freq : pandas.DateOffset, pandas.Timedelta or string, optional - Frequency increment to shift by. - If None, the index is shifted by its own `freq` attribute. - Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + keyarr : array-like + Indexer to convert. Returns ------- - pandas.Index - shifted index - - See Also - -------- - Series.shift : Shift values of Series. - - Examples - -------- - Put the first 5 month starts of 2011 into an index. 
- - >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') - >>> month_starts - DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', - '2011-05-01'], - dtype='datetime64[ns]', freq='MS') - - Shift the index by 10 days. - - >>> month_starts.shift(10, freq='D') - DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', - '2011-05-11'], - dtype='datetime64[ns]', freq=None) - - The default value of `freq` is the `freq` attribute of the index, - which is 'MS' (month start) in this example. - - >>> month_starts.shift(10) - DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', - '2012-03-01'], - dtype='datetime64[ns]', freq='MS') + converted_keyarr : array-like + """ - Notes - ----- - This method is only implemented for datetime-like index classes, - i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. - """ - raise NotImplementedError("Not supported for type %s" % - type(self).__name__) + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + return keyarr - def argsort(self, *args, **kwargs): - """ - Return the integer indices that would sort the index. + _index_shared_docs['_convert_index_indexer'] = """ + Convert an Index indexer to the appropriate dtype. Parameters ---------- - *args - Passed to `numpy.ndarray.argsort`. - **kwargs - Passed to `numpy.ndarray.argsort`. + keyarr : Index (or sub-class) + Indexer to convert. Returns ------- - numpy.ndarray - Integer indices that would sort the index if used as - an indexer. + converted_keyarr : Index (or sub-class) + """ - See Also - -------- - numpy.argsort : Similar method for NumPy arrays. - Index.sort_values : Return sorted copy of Index. + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return keyarr - Examples - -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) - >>> idx - Index(['b', 'a', 'd', 'c'], dtype='object') + _index_shared_docs['_convert_list_indexer'] = """ + Convert a list-like indexer to the appropriate dtype. - >>> order = idx.argsort() - >>> order - array([1, 0, 3, 2]) + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. 
+ kind : iloc, ix, loc, optional - >>> idx[order] - Index(['a', 'b', 'c', 'd'], dtype='object') - """ - result = self.asi8 - if result is None: - result = np.array(self) - return result.argsort(*args, **kwargs) + Returns + ------- + positional indexer or None + """ - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - return Index(np.array(self) + other) + @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + if (kind in [None, 'iloc', 'ix'] and + is_integer_dtype(keyarr) and not self.is_floating() and + not isinstance(keyarr, ABCPeriodIndex)): - def __radd__(self, other): - return Index(other + np.array(self)) + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + # missing values are flagged as -1 by get_indexer and negative + # indices are already converted to positive indices in the + # above if-statement, so the negative flags are changed to + # values outside the range of indices so as to trigger an + # IndexError in maybe_convert_indices + indexer[indexer < 0] = len(self) + from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - def __iadd__(self, other): - # alias for __add__ - return self + other + elif not self.inferred_type == 'integer': + keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) + return keyarr - def __sub__(self, other): - return Index(np.array(self) - other) + return None - def __rsub__(self, other): - return Index(other - np.array(self)) + def _invalid_indexer(self, form, key): + """ + Consistent invalid indexer message. + """ + raise TypeError("cannot do {form} indexing on {klass} with these " + "indexers [{key}] of {kind}".format( + form=form, klass=type(self), key=key, + kind=type(key))) - def __and__(self, other): - return self.intersection(other) + # -------------------------------------------------------------------- + # Reindex Methods - def __or__(self, other): - return self.union(other) + def _can_reindex(self, indexer): + """ + Check if we are allowing reindexing with this particular indexer. - def __xor__(self, other): - return self.symmetric_difference(other) + Parameters + ---------- + indexer : an integer indexer - def _get_reconciled_name_object(self, other): - """ - If the result of a set operation will be self, - return self, unless the name changes, in which - case make a shallow copy of self. + Raises + ------ + ValueError if its a duplicate axis """ - name = get_op_result_name(self, other) - if self.name != name: - return self._shallow_copy(name=name) - return self - def union(self, other): + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") + + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): """ - Form the union of two Index objects and sorts if possible. + Create index with target's values (move/add/delete values + as necessary). 
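# A quick sketch, not part of the patch itself, of the (new_index, indexer)
# contract that reindex returns, as described above; -1 marks target labels
# absent from the original index (public API assumed).
import pandas as pd

idx = pd.Index(['a', 'b', 'c'])
new_index, indexer = idx.reindex(['b', 'd'])
print(new_index)  # Index(['b', 'd'], dtype='object')
print(indexer)    # [ 1 -1]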
Parameters ---------- - other : Index or array-like + target : an iterable Returns ------- - union : Index - - Examples - -------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') """ - self._assert_can_do_setop(other) - other = ensure_index(other) - - if len(other) == 0 or self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return other._get_reconciled_name_object(self) + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, 'name') - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other) + # GH7774: preserve dtype/tz if target is empty and not an Index. + target = _ensure_has_len(target) # target may be an iterator - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): - rvals = other._ndarray_values + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, dtype=self.dtype, **attrs) else: - rvals = other._values - - if self.is_monotonic and other.is_monotonic: - try: - result = self._outer_indexer(lvals, rvals)[0] - except TypeError: - # incomparable objects - result = list(lvals) + target = ensure_index(target) - # worth making this faster? 
a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() - - if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) - result = _concat._concat_compat((lvals, other_diff)) - - try: - lvals[0] < other_diff[0] - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) - else: - types = frozenset((self.inferred_type, - other.inferred_type)) - if not types & _unsortable_types: - result.sort() - + if self.equals(target): + indexer = None else: - result = lvals - try: - result = np.sort(result) - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit, + tolerance=tolerance) + else: + if method is not None or limit is not None: + raise ValueError("cannot reindex a non-unique index " + "with a method or limit") + indexer, missing = self.get_indexer_non_unique(target) - # for subclasses - return self._wrap_setop_result(other, result) + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name - def _wrap_setop_result(self, other, result): - return self._constructor(result, name=get_op_result_name(self, other)) + return target, indexer - def intersection(self, other): + def _reindex_non_unique(self, target): """ - Form the intersection of two Index objects. - - This returns a new Index with elements common to the index and `other`, - preserving the order of the calling index. + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. 
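# A quick sketch, not part of the patch itself: the union code above sorts
# when operand values are comparable, and falls back to a RuntimeWarning
# plus concatenation order when they are not (Python 3, ~0.24-era behavior
# assumed; the non-monotonic left operand is what routes into the warning
# branch).
import warnings
import pandas as pd

print(pd.Index([1, 2, 3]).union(pd.Index([2, 3, 4])))
# Int64Index([1, 2, 3, 4], dtype='int64')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    result = pd.Index([2, 1]).union(pd.Index(['a', 'b']))
print(result)              # Index([2, 1, 'a', 'b'], dtype='object')
print(caught[0].category)  # RuntimeWarning ("sort order is undefined
                           # for incomparable objects")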
Parameters ---------- - other : Index or array-like + target : an iterable Returns ------- - intersection : Index - - Examples - -------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') """ - self._assert_can_do_setop(other) - other = ensure_index(other) - if self.equals(other): - return self._get_reconciled_name_object(other) + target = ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.intersection(other) + if len(missing): + length = np.arange(len(indexer)) - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = ensure_int64(length[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = ensure_int64(length[check]) - if self.is_monotonic and other.is_monotonic: - try: - result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_setop_result(other, result) - except TypeError: - pass + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels - try: - indexer = Index(rvals).get_indexer(lvals) - indexer = indexer.take((indexer != -1).nonzero()[0]) - except Exception: - # duplicates - indexer = algos.unique1d( - Index(rvals).get_indexer_non_unique(lvals)[0]) - indexer = indexer[indexer != -1] + # a unique indexer + if target.is_unique: - taken = other.take(indexer) - if self.name != other.name: - taken.name = None - return taken + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 - def difference(self, other, sort=True): - """ - Return a new Index with elements from the index that are not in - `other`. + # we have a non_unique selector, need to use the original + # indexer here + else: - This is the set difference of two Index objects. + # need to retake to have the same size as the indexer + indexer[~check] = -1 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + new_index = self._shallow_copy_with_infer(new_labels, freq=None) + return new_index, indexer, new_indexer + + # -------------------------------------------------------------------- + # Join Methods + + _index_shared_docs['join'] = """ + Compute join_index and indexers to conform data + structures to the new index. Parameters ---------- - other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) - .. versionadded:: 0.24.0 + .. 
versionadded:: 0.20.0 Returns ------- - difference : Index - - Examples - -------- - - >>> idx1 = pd.Index([2, 1, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') + join_index, (left_indexer, right_indexer) """ - self._assert_can_do_setop(other) - if self.equals(other): - # pass an empty np.ndarray with the appropriate dtype - return self._shallow_copy(self._data[:0]) + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + from .multi import MultiIndex + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) - other, result_name = self._convert_can_do_setop(other) + # try to figure out the join level + # GH3662 + if level is None and (self_is_mi or other_is_mi): - this = self._get_unique_index() + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, + return_indexers=return_indexers) - indexer = this.get_indexer(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) + # join on the level + if level is not None and (self_is_mi or other_is_mi): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) - the_diff = this.values.take(label_diff) - if sort: - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + other = ensure_index(other) - return this._shallow_copy(the_diff, name=result_name, freq=None) + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index - def symmetric_difference(self, other, result_name=None): - """ - Compute the symmetric difference of two Index objects. + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index - It's sorted if sorting is possible. + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result - Parameters - ---------- - other : Index or array-like - result_name : str + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.join(other, how=how, return_indexers=return_indexers) - Returns - ------- - symmetric_difference : Index + _validate_join_method(how) - Notes - ----- - ``symmetric_difference`` contains elements that appear in either - ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates - dropped. 
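# A quick sketch, not part of the patch itself, of Index.join with
# return_indexers=True, matching the docstring above; each indexer gives
# positions into its original index, with -1 for rows introduced by the
# join (public API, ~0.24-era outputs assumed).
import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([2, 3, 4])

join_index, lidx, ridx = left.join(right, how='outer', return_indexers=True)
print(join_index)  # Int64Index([1, 2, 3, 4], dtype='int64')
print(lidx)        # [ 0  1  2 -1] -- positions into `left`
print(ridx)        # [-1  0  1  2] -- positions into `right`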
+ if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: + try: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + except TypeError: + pass - Examples - -------- - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([2, 3, 4, 5]) - >>> idx1.symmetric_difference(idx2) - Int64Index([1, 5], dtype='int64') + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + elif how == 'outer': + join_index = self.union(other) - You can also use the ``^`` operator: + if sort: + join_index = join_index.sort_values() - >>> idx1 ^ idx2 - Int64Index([1, 5], dtype='int64') - """ - self._assert_can_do_setop(other) - other, result_name_update = self._convert_can_do_setop(other) - if result_name is None: - result_name = result_name_update + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index - this = self._get_unique_index() - other = other._get_unique_index() - indexer = this.get_indexer(other) + def _join_multi(self, other, how, return_indexers=True): + from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) - left_diff = this.values.take(left_indexer) + # figure out join names + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") - the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) - attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None - return self._shallow_copy_with_infer(the_diff, **attribs) + if self_is_mi and other_is_mi: - def _get_unique_index(self, dropna=False): - """ - Returns an index containing unique values. + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) - Parameters - ---------- - dropna : bool - If True, NaN values are dropped. 
+ self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) - Returns - ------- - uniques : index - """ - if self.is_unique and not dropna: - return self + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) - values = self.values + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names - if not self.is_unique: - values = self.unique() + levels, labels, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) - return self._shallow_copy(values) + multi_join_idx = multi_join_idx.remove_unused_levels() - _index_shared_docs['get_loc'] = """ - Get integer location, slice or boolean mask for requested label. + return multi_join_idx, lidx, ridx - Parameters - ---------- - key : label - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - tolerance : optional - Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation - ``abs(index[loc] - key) <= tolerance``. + jl = list(overlap)[0] - Tolerance may be a scalar - value, which applies the same tolerance to all values, or - list-like, which applies variable tolerance per element. List-like - includes list, tuple, array, Series, and must be the same size as - the index and its dtype must exactly match the index's type. + # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) - .. 
versionadded:: 0.21.0 (list-like tolerance) + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - Returns - ------- - loc : int if unique index, slice if monotonic index, else mask + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result - Examples - --------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.core.reshape.merge import _get_join_indexers - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, + sort=True) - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) - """ + left_idx = ensure_platform_int(left_idx) + right_idx = ensure_platform_int(right_idx) - @Appender(_index_shared_docs['get_loc']) - def get_loc(self, key, method=None, tolerance=None): - if method is None: - if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') - try: - return self._engine.get_loc(key) - except KeyError: - return self._engine.get_loc(self._maybe_cast_indexer(key)) - indexer = self.get_indexer([key], method=method, tolerance=tolerance) - if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') - loc = indexer.item() - if loc == -1: - raise KeyError(key) - return loc + join_index = np.asarray(self._ndarray_values.take(left_idx)) + mask = left_idx == -1 + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing. - """ + join_index = self._wrap_joined_index(join_index, other) - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): - raise - elif is_integer(key): - return s[key] + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index - s = com.values_from_object(series) - k = com.values_from_object(key) + def _join_level(self, other, level, how='left', return_indexers=False, + keep_order=True): + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. 
- k = self._convert_scalar_indexer(k, kind='getitem') - try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) - except KeyError as e1: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - - try: - return libindex.get_value_box(s, key) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover - raise IndexError(key) - raise InvalidIndexError(key) - - def set_value(self, arr, key, value): - """ - Fast lookup of value from 1-dimensional ndarray. - - Notes - ----- - Only use this if you know what you're doing. + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. """ - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + from .multi import MultiIndex - def _get_level_values(self, level): - """ - Return an Index of values for requested level. + def _get_leaf_sorter(labels): + """ + Returns sorter for the inner most level while preserving the + order of higher levels. + """ + if labels[0].size == 0: + return np.empty(0, dtype='int64') - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. + if len(labels) == 1: + lab = ensure_int64(labels[0]) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) + return sorter - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_int64(starts)) - See Also - -------- - MultiIndex.get_level_values : Get values for a level of a MultiIndex. + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError('Join on level between two MultiIndex objects ' + 'is ambiguous') - Notes - ----- - For Index, level should be 0, since there are no multiple levels. 
+ left, right = self, other - Examples - -------- + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') + level = left._get_level_number(level) + old_level = left.levels[level] - Get level values by supplying `level` as integer: + if not right.is_unique: + raise NotImplementedError('Index._join_level on non-unique index ' + 'is not implemented') - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) - get_level_values = _get_level_values + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.labels[:level + 1]) + join_index = left[left_indexer] - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. + else: + left_lev_indexer = ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) - If resulting index has only 1 level left, the result will be - of Index type, not MultiIndex. + new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], + allow_fill=False) - .. versionadded:: 0.23.1 (support for non-MultiIndex) + new_labels = list(left.labels) + new_labels[level] = new_lev_labels - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. + new_levels = list(left.levels) + new_levels[level] = new_level - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_lev_labels != -1 + if not mask.all(): + new_labels = [lab[mask] for lab in new_labels] + left_indexer = left_indexer[mask] - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_labels.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_labels, ngroups) - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0]:] + new_labels = [lab[left_indexer] for lab in new_labels] - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) + else: # sort the leaves + mask = new_lev_labels != -1 + mask_all = mask.all() + if not mask_all: + new_labels = [lab[mask] for lab in new_labels] - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) + left_indexer = _get_leaf_sorter(new_labels[:level + 1]) + new_labels = [lab[left_indexer] for lab in new_labels] - if len(new_levels) == 1: + # left_indexers are w.r.t masked frame. + # reverse to original frame! 
+ if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names, verify_integrity=False) - result.name = new_names[0] - return result + if right_lev_indexer is not None: + right_indexer = algos.take_nd(right_lev_indexer, + join_index.labels[level], + allow_fill=False) else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - - _index_shared_docs['get_indexer'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. - - Parameters - ---------- - target : %(target_klass)s - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - limit : int, optional - Maximum number of consecutive labels in ``target`` to match for - inexact matches. - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - - Tolerance may be a scalar value, which applies the same tolerance - to all values, or list-like, which applies variable tolerance per - element. List-like includes list, tuple, array, Series, and must be - the same size as the index and its dtype must exactly match the - index's type. + right_indexer = join_index.labels[level] - .. versionadded:: 0.21.0 (list-like tolerance) + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer - Returns - ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. + if return_indexers: + left_indexer = (None if left_indexer is None + else ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else ensure_platform_int(right_indexer)) + return join_index, left_indexer, right_indexer + else: + return join_index - Examples - -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1]) + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index - Notice that the return value is an array of locations in ``index`` - and ``x`` is marked by -1, as it is not in ``index``. 
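An editorial illustration, not part of the patch: the removed docstring above states that the indexer is meant to be fed to ``take``-style alignment, with -1 marking labels not found. A small sketch of that contract:

    import numpy as np
    import pandas as pd

    index = pd.Index(['c', 'a', 'b'])
    indexer = index.get_indexer(['a', 'b', 'x'])   # array([ 1,  2, -1])

    # Align data stored in index order to the target order; positions
    # flagged -1 (labels not found) must be masked out afterwards, since
    # ndarray.take interprets -1 as "last element".
    data = np.array([10.0, 20.0, 30.0])
    aligned = data.take(indexer)
    aligned[indexer == -1] = np.nan
    print(aligned)                                 # [20. 30. nan]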
- """ + sv = self._ndarray_values + ov = other._ndarray_values - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ensure_index(target) - if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, target) + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(ov, sv) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - if target.is_boolean() and self.is_numeric(): - return ensure_platform_int(np.repeat(-1, target.size)) + if return_indexers: + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + else: + return join_index - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) - if not is_dtype_equal(self.dtype, target.dtype): - this = self.astype(object) - target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + # -------------------------------------------------------------------- + # Uncategorized Methods - if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + @property + def values(self): + """ + Return the underlying data as an ndarray. + """ + return self._data.view(np.ndarray) - if method == 'pad' or method == 'backfill': - indexer = self._get_fill_indexer(target, method, limit, tolerance) - elif method == 'nearest': - indexer = self._get_nearest_indexer(target, limit, tolerance) - else: - if tolerance is not None: - raise ValueError('tolerance argument only valid if doing pad, ' - 'backfill or nearest reindexing') - if limit is not None: - raise ValueError('limit argument only valid if doing pad, ' - 'backfill or nearest reindexing') + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. - indexer = self._engine.get_indexer(target._ndarray_values) + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. 
- return ensure_platform_int(indexer) + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. - def _convert_tolerance(self, tolerance, target): - # override this method on subclasses - tolerance = np.asarray(tolerance) - if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') - return tolerance + It may differ from the public '.values' method. - def _get_fill_indexer(self, target, method, limit=None, tolerance=None): - if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = (self._engine.get_pad_indexer if method == 'pad' else - self._engine.get_backfill_indexer) - indexer = method(target._ndarray_values, limit) - else: - indexer = self._get_fill_indexer_searchsorted(target, method, - limit) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._ndarray_values, - indexer, - tolerance) - return indexer + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | - def _get_fill_indexer_searchsorted(self, target, method, limit=None): - """ - Fallback pad/backfill get_indexer that works for monotonic decreasing - indexes and non-monotonic targets. + See Also + -------- + values + _ndarray_values """ - if limit is not None: - raise ValueError('limit argument for %r method only well-defined ' - 'if index and target are monotonic' % method) + return self.values - side = 'left' if method == 'pad' else 'right' + def get_values(self): + """ + Return `Index` data as an `numpy.ndarray`. - # find exact matches first (this simplifies the algorithm) - indexer = self.get_indexer(target) - nonexact = (indexer == -1) - indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], - side) - if side == 'left': - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. - indexer[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - indexer[indexer == len(self)] = -1 - return indexer + Returns + ------- + numpy.ndarray + A one-dimensional numpy array of the `Index` values. - def _get_nearest_indexer(self, target, limit, tolerance): - """ - Get the indexer for the nearest index labels; requires an index with - values that can be subtracted from each other (e.g., not strings or - tuples). - """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) + See Also + -------- + Index.values : The attribute that get_values wraps. 
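An editorial check of the table above, not part of the patch; these are internal attributes and may change between versions:

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'])
    print(type(ci.values).__name__)           # Categorical
    print(type(ci._values).__name__)          # Categorical
    print(type(ci._ndarray_values).__name__)  # ndarray (the integer codes)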
- target = np.asarray(target) - left_distances = abs(self.values[left_indexer] - target) - right_distances = abs(self.values[right_indexer] - target) + Examples + -------- + Getting the `Index` values of a `DataFrame`: - op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) - return indexer + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index.get_values() + array(['a', 'b', 'c'], dtype=object) - def _filter_indexer_tolerance(self, target, indexer, tolerance): - distance = abs(self.values[indexer] - target) - indexer = np.where(distance <= tolerance, indexer, -1) - return indexer + Standalone `Index` values: - _index_shared_docs['get_indexer_non_unique'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + >>> idx = pd.Index(['1', '2', '3']) + >>> idx.get_values() + array(['1', '2', '3'], dtype=object) - Parameters - ---------- - target : %(target_klass)s + `MultiIndex` arrays also have only one dimension: - Returns - ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. - missing : ndarray of int - An indexer into the target of the values not found. - These correspond to the -1 in the indexer array + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... names=('number', 'letter')) + >>> midx.get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx.get_values().ndim + 1 """ + return self.values - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - if is_categorical(target): - target = target.astype(target.dtype.categories.dtype) - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer_non_unique(ptarget) + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) - if self.is_all_dates: - self = Index(self.asi8) - tgt_values = target.asi8 - else: - tgt_values = target._ndarray_values + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return ensure_platform_int(indexer), missing + _index_shared_docs['where'] = """ + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. - def get_indexer_for(self, target, **kwargs): - """ - Guaranteed return of an indexer even when non-unique. + .. versionadded:: 0.19.0 - This dispatches to get_indexer or get_indexer_nonunique - as appropriate. 
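An editorial sketch of the ``where`` behavior documented here, not part of the patch (the parameter list continues below); the NaN case exercises the numeric-dtype fallback visible in the implementation that follows:

    import pandas as pd

    idx = pd.Index([1, 2, 3, 4])
    print(idx.where(idx > 2))           # falls back to float: [nan, nan, 3.0, 4.0]
    print(idx.where(idx > 2, other=0))  # stays integer: [0, 0, 3, 4]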
+        Parameters
+        ----------
+        cond : boolean array-like with the same length as self
+        other : scalar, or array-like
         """
-        if self.is_unique:
-            return self.get_indexer(target, **kwargs)
-        indexer, _ = self.get_indexer_non_unique(target, **kwargs)
-        return indexer

-    def _maybe_promote(self, other):
-        # A hack, but it works
-        from pandas import DatetimeIndex
-        if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
-            return DatetimeIndex(self), other
-        elif self.inferred_type == 'boolean':
-            if not is_object_dtype(self.dtype):
-                return self.astype('object'), other.astype('object')
-        return self, other
+    @Appender(_index_shared_docs['where'])
+    def where(self, cond, other=None):
+        if other is None:
+            other = self._na_value

-    def groupby(self, values):
-        """
-        Group the index labels by a given array of values.
+        dtype = self.dtype
+        values = self.values

-        Parameters
-        ----------
-        values : array
-            Values used to determine the groups.
+        if is_bool(other) or is_bool_dtype(other):

-        Returns
-        -------
-        groups : dict
-            {group name -> group labels}
-        """
+            # bools force casting
+            values = values.astype(object)
+            dtype = None

-        # TODO: if we are a MultiIndex, we can do better
-        # that converting to tuples
-        from .multi import MultiIndex
-        if isinstance(values, MultiIndex):
-            values = values.values
-        values = ensure_categorical(values)
-        result = values._reverse_indexer()
+        values = np.where(cond, values, other)

-        # map to the label
-        result = {k: self.take(v) for k, v in compat.iteritems(result)}
+        if self._is_numeric_dtype and np.any(isna(values)):
+            # We can't coerce to the numeric dtype of "self" (unless
+            # it's float) if there are NaN values in our output.
+            dtype = None

-        return result
+        return self._shallow_copy_with_infer(values, dtype=dtype)

-    def map(self, mapper, na_action=None):
+    # construction helpers
+    @classmethod
+    def _try_convert_to_int_index(cls, data, copy, name, dtype):
         """
-        Map values using input correspondence (a dict, Series, or function).
+        Attempt to convert an array of data into an integer index.

         Parameters
         ----------
-        mapper : function, dict, or Series
-            Mapping correspondence.
-        na_action : {None, 'ignore'}
-            If 'ignore', propagate NA values, without passing them to the
-            mapping correspondence.
+        data : The data to convert.
+        copy : Whether to copy the data or not.
+        name : The name of the index returned.

         Returns
         -------
-        applied : Union[Index, MultiIndex], inferred
-            The output of the mapping function applied to the index.
-            If the function returns a tuple with more than one element
-            a MultiIndex will be returned.
+        int_index : data converted to either an Int64Index or a
+                    UInt64Index
+
+        Raises
+        ------
+        ValueError if the conversion was not successful.
         """
-        from .multi import MultiIndex
-        new_values = super(Index, self)._map_values(
-            mapper, na_action=na_action)
+        from .numeric import Int64Index, UInt64Index
+        if not is_unsigned_integer_dtype(dtype):
+            # skip int64 conversion attempt if uint-like dtype is passed, as
+            # this could return Int64Index when UInt64Index is what's desired
+            try:
+                res = data.astype('i8', copy=False)
+                if (res == data).all():
+                    return Int64Index(res, copy=copy, name=name)
+            except (OverflowError, TypeError, ValueError):
+                pass

-        attributes = self._get_attributes_dict()
+        # Conversion to int64 failed (possibly due to overflow) or was skipped,
+        # so let's try now with uint64.
+        try:
+            res = data.astype('u8', copy=False)
+            if (res == data).all():
+                return UInt64Index(res, copy=copy, name=name)
+        except (OverflowError, TypeError, ValueError):
+            pass

-        # we can return a MultiIndex
-        if new_values.size and isinstance(new_values[0], tuple):
-            if isinstance(self, MultiIndex):
-                names = self.names
-            elif attributes.get('name'):
-                names = [attributes.get('name')] * len(new_values[0])
-            else:
-                names = None
-            return MultiIndex.from_tuples(new_values,
-                                          names=names)
+        raise ValueError

-        attributes['copy'] = False
-        if not new_values.size:
-            # empty
-            attributes['dtype'] = self.dtype
+    @classmethod
+    def _scalar_data_error(cls, data):
+        raise TypeError('{0}(...) must be called with a collection of some '
+                        'kind, {1} was passed'.format(cls.__name__,
+                                                      repr(data)))

-        return Index(new_values, **attributes)
+    @classmethod
+    def _string_data_error(cls, data):
+        raise TypeError('String dtype not supported, you may need '
+                        'to explicitly cast to a numeric type')

-    def isin(self, values, level=None):
+    @classmethod
+    def _coerce_to_ndarray(cls, data):
         """
-        Return a boolean array where the index values are in `values`.
+        Coerces data to ndarray.

-        Compute boolean array of whether each index value is found in the
-        passed set of values. The length of the returned boolean array matches
-        the length of the index.
+        Converts other iterables to list first and then to array.
+        Does not touch ndarrays.

-        Parameters
-        ----------
-        values : set or list-like
-            Sought values.
+        Raises
+        ------
+        TypeError
+            When the data passed in is a scalar.
+        """

-        .. versionadded:: 0.18.1
+        if not isinstance(data, (np.ndarray, Index)):
+            if data is None or is_scalar(data):
+                cls._scalar_data_error(data)

-           Support for values as a set.
+            # other iterable of some kind
+            if not isinstance(data, (ABCSeries, list, tuple)):
+                data = list(data)
+            data = np.asarray(data)
+        return data

-        level : str or int, optional
-            Name or position of the index level to use (if the index is a
-            `MultiIndex`).
+    def _coerce_scalar_to_index(self, item):
+        """
+        Coerce a scalar into a value compatible with our index type.

-        Returns
-        -------
-        is_contained : ndarray
-            NumPy array of boolean values.
+        Parameters
+        ----------
+        item : scalar item to coerce
+        """
+        dtype = self.dtype

-        See Also
-        --------
-        Series.isin : Same for Series.
-        DataFrame.isin : Same method for DataFrames.
+        if self._is_numeric_dtype and isna(item):
+            # We can't coerce to the numeric dtype of "self" (unless
+            # it's float) if there are NaN values in our output.
+            dtype = None

-        Notes
-        -----
-        In the case of `MultiIndex` you must either specify `values` as a
-        list-like object containing tuples that are the same length as the
-        number of levels, or specify `level`. Otherwise it will raise a
-        ``ValueError``.
+        return Index([item], dtype=dtype, **self._get_attributes_dict())

-        If `level` is specified:
+    def _to_safe_for_reshape(self):
+        """
+        Convert to object if we are a categorical.
+        """
+        return self

-        - if it is the name of one *and only one* index level, use that level;
-        - otherwise it should be a number indicating level position.
+    def _convert_for_op(self, value):
+        """
+        Convert value to be insertable to ndarray.
+        """
+        return value

-        Examples
-        --------
-        >>> idx = pd.Index([1,2,3])
-        >>> idx
-        Int64Index([1, 2, 3], dtype='int64')
+    def _assert_can_do_op(self, value):
+        """
+        Check value is valid for scalar op.
+ """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) - Check whether each index value in a list of values. - >>> idx.isin([1, 4]) - array([ True, False, False]) + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color')) - >>> midx - MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], - labels=[[0, 1, 2], [2, 0, 1]], - names=['number', 'color']) + def _is_memory_usage_qualified(self): + """ + Return a boolean if we need a qualified .info display. + """ + return self.is_object() - Check whether the strings in the 'color' level of the MultiIndex - are in a list of colors. + def is_type_compatible(self, kind): + return kind == self.inferred_type - >>> midx.isin(['red', 'orange', 'yellow'], level='color') - array([ True, False, False]) + _index_shared_docs['__contains__'] = """ + Return a boolean if this key is IN the index. - To check across the levels of a MultiIndex, pass a list of tuples: + Parameters + ---------- + key : object - >>> midx.isin([(1, 'red'), (3, 'red')]) - array([ True, False, False]) + Returns + ------- + boolean + """ - For a DatetimeIndex, string values in `values` are converted to - Timestamps. + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) - - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) - """ - if level is not None: - self._validate_index_level(level) - return algos.isin(self, values) - - def _can_reindex(self, indexer): - """ - Check if we are allowing reindexing with this particular indexer. + _index_shared_docs['contains'] = """ + Return a boolean if this key is IN the index. Parameters ---------- - indexer : an integer indexer + key : object - Raises - ------ - ValueError if its a duplicate axis + Returns + ------- + boolean """ - # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): - raise ValueError("cannot reindex from a duplicate axis") + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): + hash(key) + try: + return key in self._engine + except (TypeError, ValueError): + return False - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def __hash__(self): + raise TypeError("unhashable type: %r" % type(self).__name__) + + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") + + def __getitem__(self, key): """ - Create index with target's values (move/add/delete values - as necessary). + Override numpy.ndarray's __getitem__ method to work as desired. - Parameters - ---------- - target : an iterable + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). - Returns - ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. 
""" - # GH6552: preserve names when reindexing to non-named target - # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + getitem = self._data.__getitem__ + promote = self._shallow_copy - # GH7774: preserve dtype/tz if target is empty and not an Index. - target = _ensure_has_len(target) # target may be an iterator + if is_scalar(key): + key = com.cast_scalar_indexer(key) + return getitem(key) - if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, dtype=self.dtype, **attrs) - else: - target = ensure_index(target) + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return promote(getitem(key)) - if level is not None: - if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) - else: - if self.equals(target): - indexer = None - else: + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) - if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) - else: - if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") - indexer, missing = self.get_indexer_non_unique(target) + key = com.values_from_object(key) + result = getitem(key) + if not is_scalar(result): + return promote(result) + else: + return result - if preserve_names and target.nlevels == 1 and target.name != self.name: - target = target.copy() - target.name = self.name + def _can_hold_identifiers_and_holds_name(self, name): + """ + Faster check for ``name in self`` when we know `name` is a Python + identifier (e.g. in NDFrame.__getattr__, which hits this to support + . key lookup). For indexes that can't hold identifiers (everything + but object & categorical) we just return False. - return target, indexer + https://github.com/pandas-dev/pandas/issues/19764 + """ + if self.is_object() or self.is_categorical(): + return name in self + return False - def _reindex_non_unique(self, target): + def append(self, other): """ - Create a new index with target's values (move/add/delete values as - necessary) use with non-unique Index and a possibly non-unique target. + Append a collection of Index options together. 
Parameters ---------- - target : an iterable + other : Index or list/tuple of indices Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index - + appended : Index """ - target = ensure_index(target) - indexer, missing = self.get_indexer_non_unique(target) - check = indexer != -1 - new_labels = self.take(indexer[check]) - new_indexer = None - - if len(missing): - length = np.arange(len(indexer)) - - missing = ensure_platform_int(missing) - missing_labels = target.take(missing) - missing_indexer = ensure_int64(length[~check]) - cur_labels = self.take(indexer[check]).values - cur_indexer = ensure_int64(length[check]) - - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels + to_concat = [self] - # a unique indexer - if target.is_unique: + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange(len(cur_labels)) - new_indexer[missing_indexer] = -1 + for obj in to_concat: + if not isinstance(obj, Index): + raise TypeError('all inputs must be Index') - # we have a non_unique selector, need to use the original - # indexer here - else: + names = {obj.name for obj in to_concat} + name = None if len(names) > 1 else self.name - # need to retake to have the same size as the indexer - indexer[~check] = -1 + return self._concat(to_concat, name) - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 + def _concat(self, to_concat, name): - new_index = self._shallow_copy_with_infer(new_labels, freq=None) - return new_index, indexer, new_indexer + typs = _concat.get_dtype_kinds(to_concat) - _index_shared_docs['join'] = """ - Compute join_index and indexers to conform data - structures to the new index. + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - Parameters - ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - sort : boolean, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword) + def _concat_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class. + """ + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) - .. versionadded:: 0.20.0 + def putmask(self, mask, value): + """ + Return a new Index of the values set with the mask. 
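An editorial aside, not part of the patch: the ``append``/``_concat`` code above concatenates same-dtype inputs directly and falls back to object dtype for mixed inputs, e.g.:

    import pandas as pd

    a = pd.Index([1, 2])
    print(a.append(pd.Index([3, 4])))   # Int64Index([1, 2, 3, 4], dtype='int64')
    print(a.append(pd.Index(['x'])))    # Index([1, 2, 3, 'x'], dtype='object')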
- Returns - ------- - join_index, (left_indexer, right_indexer) + See Also + -------- + numpy.ndarray.putmask """ + values = self.values.copy() + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + # coerces to object + return self.astype(object).putmask(mask, value) - # try to figure out the join level - # GH3662 - if level is None and (self_is_mi or other_is_mi): + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True - # have the same levels/names so a simple join - if self.names == other.names: - pass - else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + if not isinstance(other, Index): + return False - # join on the level - if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) - other = ensure_index(other) + try: + return array_equivalent(com.values_from_object(self), + com.values_from_object(other)) + except Exception: + return False - if len(other) == 0 and how in ('left', 'outer'): - join_index = self._shallow_copy() - if return_indexers: - rindexer = np.repeat(-1, len(join_index)) - return join_index, None, rindexer - else: - return join_index + def identical(self, other): + """ + Similar to equals, but check that other comparable attributes are + also equal. + """ + return (self.equals(other) and + all((getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables)) and + type(self) == type(other)) - if len(self) == 0 and how in ('right', 'outer'): - join_index = other._shallow_copy() - if return_indexers: - lindexer = np.repeat(-1, len(join_index)) - return join_index, lindexer, None - else: - return join_index + def asof(self, label): + """ + Return the label from the index, or, if not present, the previous one. - if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) - if return_indexers: - x, y, z = result - result = x, z, y - return result + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.join(other, how=how, return_indexers=return_indexers) + Parameters + ---------- + label : object + The label up to which the method returns the latest index label. - _validate_join_method(how) + Returns + ------- + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. 
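An editorial aside on ``equals`` vs ``identical`` above, not part of the patch: ``equals`` compares elements only, while ``identical`` also requires matching comparable attributes and type:

    import pandas as pd

    a = pd.Index([1, 2, 3], name='x')
    b = pd.Index([1.0, 2.0, 3.0], name='y')

    print(a.equals(b))     # True  -- same elements; dtype and name ignored
    print(a.identical(b))  # False -- name and index type must also match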
-        if not self.is_unique and not other.is_unique:
-            return self._join_non_unique(other, how=how,
-                                         return_indexers=return_indexers)
-        elif not self.is_unique or not other.is_unique:
-            if self.is_monotonic and other.is_monotonic:
-                return self._join_monotonic(other, how=how,
-                                            return_indexers=return_indexers)
-            else:
-                return self._join_non_unique(other, how=how,
-                                             return_indexers=return_indexers)
-        elif self.is_monotonic and other.is_monotonic:
-            try:
-                return self._join_monotonic(other, how=how,
-                                            return_indexers=return_indexers)
-            except TypeError:
-                pass
+        See Also
+        --------
+        Series.asof : Return the latest value in a Series up to the
+            passed index.
+        merge_asof : Perform an asof merge (similar to left join but it
+            matches on nearest key rather than equal key).
+        Index.get_loc : An `asof` is a thin wrapper around `get_loc`
+            with method='pad'.

-        if how == 'left':
-            join_index = self
-        elif how == 'right':
-            join_index = other
-        elif how == 'inner':
-            join_index = self.intersection(other)
-        elif how == 'outer':
-            join_index = self.union(other)
+        Examples
+        --------
+        `Index.asof` returns the latest index label up to the passed label.

-        if sort:
-            join_index = join_index.sort_values()
+        >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
+        >>> idx.asof('2014-01-01')
+        '2013-12-31'

-        if return_indexers:
-            if join_index is self:
-                lindexer = None
-            else:
-                lindexer = self.get_indexer(join_index)
-            if join_index is other:
-                rindexer = None
-            else:
-                rindexer = other.get_indexer(join_index)
-            return join_index, lindexer, rindexer
+        If the label is in the index, the method returns the passed label.
+
+        >>> idx.asof('2014-01-02')
+        '2014-01-02'
+
+        If all of the labels in the index are later than the passed label,
+        NaN is returned.
+
+        >>> idx.asof('1999-01-02')
+        nan
+
+        If the index is not sorted, an error is raised.
+
+        >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02',
+        ...                            '2014-01-03'])
+        >>> idx_not_sorted.asof('2013-12-31')
+        Traceback (most recent call last):
+        ValueError: index must be monotonic increasing or decreasing
+        """
+        try:
+            loc = self.get_loc(label, method='pad')
+        except KeyError:
+            return self._na_value
         else:
-            return join_index
+            if isinstance(loc, slice):
+                loc = loc.indices(len(self))[-1]
+            return self[loc]

-    def _join_multi(self, other, how, return_indexers=True):
-        from .multi import MultiIndex
-        from pandas.core.reshape.merge import _restore_dropped_levels_multijoin
+    def asof_locs(self, where, mask):
+        """
+        Finds the locations (indices) of the labels from the index for
+        every entry in the `where` argument.

-        # figure out join names
-        self_names = set(com._not_none(*self.names))
-        other_names = set(com._not_none(*other.names))
-        overlap = self_names & other_names
+        As in the `asof` function, if the label (a particular entry in
+        `where`) is not in the index, the latest index label up to the
+        passed label is chosen and its index returned.

-        # need at least 1 in common
-        if not overlap:
-            raise ValueError("cannot join with no overlapping index names")
+        If all of the labels in the index are later than a label in `where`,
+        -1 is returned.

-        self_is_mi = isinstance(self, MultiIndex)
-        other_is_mi = isinstance(other, MultiIndex)
+        `mask` is used to ignore NA values in the index during calculation.

-        if self_is_mi and other_is_mi:
+        Parameters
+        ----------
+        where : Index
+            An Index consisting of an array of timestamps.
+        mask : array-like
+            Array of booleans denoting where values in the original
+            data are not NA.
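An editorial worked example of ``asof_locs`` as specified above, not part of the patch (internal API; assumes a monotonically increasing index):

    import numpy as np
    import pandas as pd

    idx = pd.Index([10, 20, 30])
    where = pd.Index([5, 25])
    mask = np.ones(len(idx), dtype=bool)  # no NA positions to ignore

    # 5 precedes every label -> -1; 25 falls back to label 20 at position 1.
    print(idx.asof_locs(where, mask))     # [-1  1]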
- # Drop the non-matching levels from left and right respectively - ldrop_names = list(self_names - overlap) - rdrop_names = list(other_names - overlap) + Returns + ------- + numpy.ndarray + An array of locations (indices) of the labels from the Index + which correspond to the return values of the `asof` function + for every element in `where`. + """ + locs = self.values[mask].searchsorted(where.values, side='right') + locs = np.where(locs > 0, locs - 1, 0) - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + result = np.arange(len(self))[mask].take(locs) - # Join left and right - # Join on same leveled multi-index frames is supported - join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, - return_indexers=True) + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 - # Restore the dropped levels - # Returned index level order is - # common levels, ldrop_names, rdrop_names - dropped_names = ldrop_names + rdrop_names + return result - levels, labels, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) + def sort_values(self, return_indexer=False, ascending=True): + """ + Return a sorted copy of the index. - # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, labels=labels, - names=names, verify_integrity=False) + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. - multi_join_idx = multi_join_idx.remove_unused_levels() + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. - return multi_join_idx, lidx, ridx + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. - jl = list(overlap)[0] + See Also + -------- + pandas.Series.sort_values : Sort values of a Series. + pandas.DataFrame.sort_values : Sort values in a DataFrame. - # Case where only one index is multi - # make the indices into mi's that match - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') + + Sort values in ascending order (default behavior). + + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') + + Sort values in descending order, and also get the indices `idx` was + sorted by. + + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def sort(self, *args, **kwargs): + raise TypeError("cannot sort an Index object in-place, use " + "sort_values instead") + + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. + + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. + + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. 
+ freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + + Returns + ------- + pandas.Index + shifted index + + See Also + -------- + Series.shift : Shift values of Series. + + Examples + -------- + Put the first 5 month starts of 2011 into an index. + + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') + + Shift the index by 10 days. + + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) + + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. + + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') + + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. + """ + raise NotImplementedError("Not supported for type %s" % + type(self).__name__) + + def argsort(self, *args, **kwargs): + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. + + Returns + ------- + numpy.ndarray + Integer indices that would sort the index if used as + an indexer. + + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. + + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') + + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) + + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + result = self.asi8 + if result is None: + result = np.array(self) + return result.argsort(*args, **kwargs) + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. + """ + + # if we have something that is Index-like, then + # use this, e.g. 
DatetimeIndex + s = getattr(series, '_values', None) + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): + # GH 20882, 21257 + # Unify Index and ExtensionArray treatment + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): + raise + elif is_integer(key): + return s[key] + + s = com.values_from_object(series) + k = com.values_from_object(key) + + k = self._convert_scalar_indexer(k, kind='getitem') + try: + return self._engine.get_value(s, k, + tz=getattr(series.dtype, 'tz', None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + + try: + return libindex.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if is_scalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. + + Notes + ----- + Only use this if you know what you're doing. + """ + self._engine.set_value(com.values_from_object(arr), + com.values_from_object(key), value) + + _index_shared_docs['get_indexer_non_unique'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : ndarray of int + An indexer into the target of the values not found. + These correspond to the -1 in the indexer array + """ - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + if self.is_all_dates: + self = Index(self.asi8) + tgt_values = target.asi8 + else: + tgt_values = target._ndarray_values - def _join_non_unique(self, other, how='left', return_indexers=False): - from pandas.core.reshape.merge import _get_join_indexers + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing - left_idx, right_idx = _get_join_indexers([self._ndarray_values], - [other._ndarray_values], - how=how, - sort=True) + def get_indexer_for(self, target, **kwargs): + """ + Guaranteed return of an indexer even when non-unique. - left_idx = ensure_platform_int(left_idx) - right_idx = ensure_platform_int(right_idx) + This dispatches to get_indexer or get_indexer_nonunique + as appropriate. 
+        """
+        if self.is_unique:
+            return self.get_indexer(target, **kwargs)
+        indexer, _ = self.get_indexer_non_unique(target, **kwargs)
+        return indexer

-        join_index = np.asarray(self._ndarray_values.take(left_idx))
-        mask = left_idx == -1
-        np.putmask(join_index, mask, other._ndarray_values.take(right_idx))
+    def _maybe_promote(self, other):
+        # A hack, but it works
+        from pandas import DatetimeIndex
+        if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
+            return DatetimeIndex(self), other
+        elif self.inferred_type == 'boolean':
+            if not is_object_dtype(self.dtype):
+                return self.astype('object'), other.astype('object')
+        return self, other

-        join_index = self._wrap_joined_index(join_index, other)
+    def groupby(self, values):
+        """
+        Group the index labels by a given array of values.

-        if return_indexers:
-            return join_index, left_idx, right_idx
-        else:
-            return join_index
+        Parameters
+        ----------
+        values : array
+            Values used to determine the groups.

-    def _join_level(self, other, level, how='left', return_indexers=False,
-                    keep_order=True):
+        Returns
+        -------
+        groups : dict
+            {group name -> group labels}
         """
-        The join method *only* affects the level of the resulting
-        MultiIndex. Otherwise it just exactly aligns the Index data to the
-        labels of the level in the MultiIndex.
-        If ```keep_order == True```, the order of the data indexed by the
-        MultiIndex will not be changed; otherwise, it will tie out
-        with `other`.
-        """
+        # TODO: if we are a MultiIndex, we can do better
+        # than converting to tuples
         from .multi import MultiIndex
+        if isinstance(values, MultiIndex):
+            values = values.values
+        values = ensure_categorical(values)
+        result = values._reverse_indexer()

-        def _get_leaf_sorter(labels):
-            """
-            Returns sorter for the inner most level while preserving the
-            order of higher levels.
-            """
-            if labels[0].size == 0:
-                return np.empty(0, dtype='int64')
+        # map to the label
+        result = {k: self.take(v) for k, v in compat.iteritems(result)}

-            if len(labels) == 1:
-                lab = ensure_int64(labels[0])
-                sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max())
-                return sorter
+        return result

-            # find indexers of beginning of each set of
-            # same-key labels w.r.t all but last level
-            tic = labels[0][:-1] != labels[0][1:]
-            for lab in labels[1:-1]:
-                tic |= lab[:-1] != lab[1:]
+    def map(self, mapper, na_action=None):
+        """
+        Map values using input correspondence (a dict, Series, or function).

-            starts = np.hstack(([True], tic, [True])).nonzero()[0]
-            lab = ensure_int64(labels[-1])
-            return lib.get_level_sorter(lab, ensure_int64(starts))
+        Parameters
+        ----------
+        mapper : function, dict, or Series
+            Mapping correspondence.
+        na_action : {None, 'ignore'}
+            If 'ignore', propagate NA values, without passing them to the
+            mapping correspondence.

-        if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
-            raise TypeError('Join on level between two MultiIndex objects '
-                            'is ambiguous')
+        Returns
+        -------
+        applied : Union[Index, MultiIndex], inferred
+            The output of the mapping function applied to the index.
+            If the function returns a tuple with more than one element
+            a MultiIndex will be returned.
+ """ - left, right = self, other + from .multi import MultiIndex + new_values = super(Index, self)._map_values( + mapper, na_action=na_action) - flip_order = not isinstance(self, MultiIndex) - if flip_order: - left, right = right, left - how = {'right': 'left', 'left': 'right'}.get(how, how) + attributes = self._get_attributes_dict() - level = left._get_level_number(level) - old_level = left.levels[level] + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get('name'): + names = [attributes.get('name')] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, + names=names) - if not right.is_unique: - raise NotImplementedError('Index._join_level on non-unique index ' - 'is not implemented') + attributes['copy'] = False + if not new_values.size: + # empty + attributes['dtype'] = self.dtype - new_level, left_lev_indexer, right_lev_indexer = \ - old_level.join(right, how=how, return_indexers=True) + return Index(new_values, **attributes) - if left_lev_indexer is None: - if keep_order or len(left) == 0: - left_indexer = None - join_index = left - else: # sort the leaves - left_indexer = _get_leaf_sorter(left.labels[:level + 1]) - join_index = left[left_indexer] + def isin(self, values, level=None): + """ + Return a boolean array where the index values are in `values`. - else: - left_lev_indexer = ensure_int64(left_lev_indexer) - rev_indexer = lib.get_reverse_indexer(left_lev_indexer, - len(old_level)) + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. - new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], - allow_fill=False) + Parameters + ---------- + values : set or list-like + Sought values. - new_labels = list(left.labels) - new_labels[level] = new_lev_labels + .. versionadded:: 0.18.1 - new_levels = list(left.levels) - new_levels[level] = new_level + Support for values as a set. - if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left), dtype=np.intp) - mask = new_lev_labels != -1 - if not mask.all(): - new_labels = [lab[mask] for lab in new_labels] - left_indexer = left_indexer[mask] + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). - else: # tie out the order with other - if level == 0: # outer most level, take the fast route - ngroups = 1 + new_lev_labels.max() - left_indexer, counts = libalgos.groupsort_indexer( - new_lev_labels, ngroups) + Returns + ------- + is_contained : ndarray + NumPy array of boolean values. - # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0]:] - new_labels = [lab[left_indexer] for lab in new_labels] + See Also + -------- + Series.isin : Same for Series. + DataFrame.isin : Same method for DataFrames. - else: # sort the leaves - mask = new_lev_labels != -1 - mask_all = mask.all() - if not mask_all: - new_labels = [lab[mask] for lab in new_labels] + Notes + ----- + In the case of `MultiIndex` you must either specify `values` as a + list-like object containing tuples that are the same length as the + number of levels, or specify `level`. Otherwise it will raise a + ``ValueError``. - left_indexer = _get_leaf_sorter(new_labels[:level + 1]) - new_labels = [lab[left_indexer] for lab in new_labels] + If `level` is specified: - # left_indexers are w.r.t masked frame. 
-        # reverse to original frame!
-        if not mask_all:
-            left_indexer = mask.nonzero()[0][left_indexer]
+        - if it is the name of one *and only one* index level, use that level;
+        - otherwise it should be a number indicating level position.

-        join_index = MultiIndex(levels=new_levels, labels=new_labels,
-                                names=left.names, verify_integrity=False)
+        Examples
+        --------
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Int64Index([1, 2, 3], dtype='int64')

-        if right_lev_indexer is not None:
-            right_indexer = algos.take_nd(right_lev_indexer,
-                                          join_index.labels[level],
-                                          allow_fill=False)
-        else:
-            right_indexer = join_index.labels[level]
+        Check whether each index value is in a list of values.
+
+        >>> idx.isin([1, 4])
+        array([ True, False, False])

-        if flip_order:
-            left_indexer, right_indexer = right_indexer, left_indexer
+        >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3],
+        ...                                  ['red', 'blue', 'green']],
+        ...                                  names=('number', 'color'))
+        >>> midx
+        MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']],
+                   labels=[[0, 1, 2], [2, 0, 1]],
+                   names=['number', 'color'])
+
+        Check whether the strings in the 'color' level of the MultiIndex
+        are in a list of colors.

-        if return_indexers:
-            left_indexer = (None if left_indexer is None
-                            else ensure_platform_int(left_indexer))
-            right_indexer = (None if right_indexer is None
-                             else ensure_platform_int(right_indexer))
-            return join_index, left_indexer, right_indexer
-        else:
-            return join_index
+        >>> midx.isin(['red', 'orange', 'yellow'], level='color')
+        array([ True, False, False])

-    def _join_monotonic(self, other, how='left', return_indexers=False):
-        if self.equals(other):
-            ret_index = other if how == 'right' else self
-            if return_indexers:
-                return ret_index, None, None
-            else:
-                return ret_index
+        To check across the levels of a MultiIndex, pass a list of tuples:

-        sv = self._ndarray_values
-        ov = other._ndarray_values
+        >>> midx.isin([(1, 'red'), (3, 'red')])
+        array([ True, False, False])

-        if self.is_unique and other.is_unique:
-            # We can perform much better than the general case
-            if how == 'left':
-                join_index = self
-                lidx = None
-                ridx = self._left_indexer_unique(sv, ov)
-            elif how == 'right':
-                join_index = other
-                lidx = self._left_indexer_unique(ov, sv)
-                ridx = None
-            elif how == 'inner':
-                join_index, lidx, ridx = self._inner_indexer(sv, ov)
-                join_index = self._wrap_joined_index(join_index, other)
-            elif how == 'outer':
-                join_index, lidx, ridx = self._outer_indexer(sv, ov)
-                join_index = self._wrap_joined_index(join_index, other)
-        else:
-            if how == 'left':
-                join_index, lidx, ridx = self._left_indexer(sv, ov)
-            elif how == 'right':
-                join_index, ridx, lidx = self._left_indexer(ov, sv)
-            elif how == 'inner':
-                join_index, lidx, ridx = self._inner_indexer(sv, ov)
-            elif how == 'outer':
-                join_index, lidx, ridx = self._outer_indexer(sv, ov)
-            join_index = self._wrap_joined_index(join_index, other)
+        For a DatetimeIndex, string values in `values` are converted to
+        Timestamps.
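An editorial aside, not part of the patch: per the ``versionadded`` note in the docstring above, `values` may also be a set:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'])
    print(idx.isin({'a', 'c'}))   # [ True False  True]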
- if return_indexers: - lidx = None if lidx is None else ensure_platform_int(lidx) - ridx = None if ridx is None else ensure_platform_int(ridx) - return join_index, lidx, ridx - else: - return join_index + >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] + >>> dti = pd.to_datetime(dates) + >>> dti + DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], + dtype='datetime64[ns]', freq=None) - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Index(joined, name=name) + >>> dti.isin(['2000-03-11']) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self, values) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, @@ -4630,190 +4870,8 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) - _index_shared_docs['index_unique'] = ( - """ - Return unique values in the index. Uniques are returned in order - of appearance, this does NOT sort. - - Parameters - ---------- - level : int or str, optional, default None - Only return values from specified level (for MultiIndex) - - .. versionadded:: 0.23.0 - - Returns - ------- - Index without duplicates - - See Also - -------- - unique - Series.unique - """) - - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - result = super(Index, self).unique() - return self._shallow_copy(result) - - def drop_duplicates(self, keep='first'): - """ - Return Index with duplicate values removed. - - Parameters - ---------- - keep : {'first', 'last', ``False``}, default 'first' - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - Returns - ------- - deduplicated : Index - - See Also - -------- - Series.drop_duplicates : Equivalent method on Series. - DataFrame.drop_duplicates : Equivalent method on DataFrame. - Index.duplicated : Related method on Index, indicating duplicate - Index values. - - Examples - -------- - Generate an pandas.Index with duplicate values. - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - - The `keep` parameter controls which duplicate values are removed. - The value 'first' keeps the first occurrence for each - set of duplicated entries. The default value of keep is 'first'. - - >>> idx.drop_duplicates(keep='first') - Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') - - The value 'last' keeps the last occurrence for each set of duplicated - entries. - - >>> idx.drop_duplicates(keep='last') - Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') - - The value ``False`` discards all sets of duplicated entries. - - >>> idx.drop_duplicates(keep=False) - Index(['cow', 'beetle', 'hippo'], dtype='object') - """ - return super(Index, self).drop_duplicates(keep=keep) - - def duplicated(self, keep='first'): - """ - Indicate duplicate index values. - - Duplicated values are indicated as ``True`` values in the resulting - array. Either all duplicates, all except the first, or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - The value or values in a set of duplicates to mark as missing. - - - 'first' : Mark duplicates as ``True`` except for the first - occurrence. 
- - 'last' : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep on ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) - - Returns - ------- - numpy.ndarray - - See Also - -------- - pandas.Series.duplicated : Equivalent method on pandas.Series. - pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. - pandas.Index.drop_duplicates : Remove duplicate values from Index. - """ - return super(Index, self).duplicated(keep=keep) - - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value - - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) - - Returns - ------- - filled : %(klass)s - """ - - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values - - Parameters - ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. 
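The `fillna`/`dropna` docstrings above carry no usage examples; a quick sketch of the behavior they describe (both return a new Index, and `fillna` falls back to a shallow copy when there is nothing to fill):

import numpy as np
import pandas as pd

idx = pd.Index([1.0, np.nan, 3.0])
idx.fillna(0)                    # Float64Index([1.0, 0.0, 3.0], dtype='float64')
idx.dropna()                     # Float64Index([1.0, 3.0], dtype='float64')
pd.Index([1.0, 2.0]).fillna(0)   # no NaNs -> shallow copy of the original
idx.dropna(how='bad')            # ValueError: invalid how option: bad
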
- - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f05b0fdd4a323..6b84e8deea493 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -94,6 +94,9 @@ def _engine_type(self): _attributes = ['name'] + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=None): @@ -212,6 +215,8 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, dtype=None, **kwargs): @@ -284,6 +289,9 @@ def equals(self, other): return False + # -------------------------------------------------------------------- + # Rendering Methods + @property def _formatter_func(self): return self.categories._formatter_func @@ -307,6 +315,8 @@ def _format_attrs(self): attrs.append(('length', len(self))) return attrs + # -------------------------------------------------------------------- + @property def inferred_type(self): return 'categorical' diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0e2f7ceb24e94..5e25efe77d8b9 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -331,9 +331,6 @@ def _box_values_as_index(self): from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): try: @@ -544,6 +541,12 @@ def argmax(self, axis=None, *args, **kwargs): i8[mask] = 0 return i8.argmax() + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + @property def _formatter_func(self): raise AbstractMethodError(self) @@ -561,6 +564,8 @@ def _format_attrs(self): attrs.append(('freq', freq)) return attrs + # -------------------------------------------------------------------- + def _convert_scalar_indexer(self, key, kind=None): """ We don't allow integer or float indexing on datetime-like when using diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8b563a9b9bed0..16c1e22d40017 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -372,22 +372,12 @@ def nbytes(self): # for TZ-aware return self._ndarray_values.nbytes - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return libts.ints_to_pydatetime(self.asi8, self.tz) - @cache_readonly def _is_dates_only(self): """Return a boolean 
if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only return _is_dates_only(self.values) and self.tz is None - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) - def __reduce__(self): # we use a special reudce here because we need @@ -439,6 +429,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs + # -------------------------------------------------------------------- + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return libts.ints_to_pydatetime(self.asi8, self.tz) + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -448,124 +445,14 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): format=format, na_rep=na_rep) - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): - # GH 18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: - return self.tz_localize(new_tz) - return self.tz_convert(new_tz) - elif is_period_dtype(dtype): - return self.to_period(freq=dtype.freq) - return super(DatetimeIndex, self).astype(dtype, copy=copy) - - def _get_time_micros(self): - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - return fields.get_time_micros(values) - - def to_series(self, keep_tz=None, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index - - Parameters - ---------- - keep_tz : optional, defaults False - Return the data keeping the timezone. - - If keep_tz is True: - - If the timezone is not set, the resulting - Series will have a datetime64[ns] dtype. - - Otherwise the Series will have an datetime64[ns, tz] dtype; the - tz will be preserved. - - If keep_tz is False: - - Series will have a datetime64[ns] dtype. TZ aware - objects will have the tz removed. - - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. - - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index - - Returns - ------- - Series - """ - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - if keep_tz is None and self.tz is not None: - warnings.warn("The default of the 'keep_tz' keyword will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", FutureWarning, stacklevel=2) - keep_tz = False - elif keep_tz is False: - warnings.warn("Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. 
If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", FutureWarning, stacklevel=2) - - if keep_tz and self.tz is not None: - # preserve the tz & copy - values = self.copy(deep=True) - else: - values = self.values.copy() - - return Series(values, index=index, name=name) - - def snap(self, freq='S'): - """ - Snap time stamps to nearest occurring frequency - """ - # Superdumb, punting on any optimizing - freq = to_offset(freq) - - snapped = np.empty(len(self), dtype=_NS_DTYPE) - - for i, v in enumerate(self): - s = v - if not freq.onOffset(s): - t0 = freq.rollback(s) - t1 = freq.rollforward(s) - if abs(s - t0) < abs(t1 - s): - s = t0 - else: - s = t1 - snapped[i] = s - - # we know it conforms; skip check - return DatetimeIndex(snapped, freq=freq, verify_integrity=False) - # TODO: what about self.name? if so, use shallow_copy? - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: "'%s'" % formatter(x, tz=self.tz) - # TODO(DatetimeArray): change dispatch once inheritance is removed - # call DatetimeArray method - result = DatetimeArray.unique(self) - return self._shallow_copy(result._data) + # -------------------------------------------------------------------- + # Set Operation Methods def union(self, other): """ @@ -634,51 +521,6 @@ def union_many(self, others): return this - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - """ - See Index.join - """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, 'tz', None) - return self._simple_new(joined, name, tz=tz) - def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): return False @@ -805,6 +647,172 @@ def intersection(self, other): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) + # -------------------------------------------------------------------- + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if (is_datetime64_ns_dtype(dtype) and + not is_dtype_equal(dtype, self.dtype)): + # GH 18951: datetime64_ns dtype but not equal means different tz + new_tz = getattr(dtype, 'tz', None) + if getattr(self.dtype, 'tz', 
None) is None: + return self.tz_localize(new_tz) + return self.tz_convert(new_tz) + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) + return super(DatetimeIndex, self).astype(dtype, copy=copy) + + def _get_time_micros(self): + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + return fields.get_time_micros(values) + + def to_series(self, keep_tz=None, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False + Return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set, the resulting + Series will have a datetime64[ns] dtype. + + Otherwise the Series will have an datetime64[ns, tz] dtype; the + tz will be preserved. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. TZ aware + objects will have the tz removed. + + .. versionchanged:: 0.24 + The default value will change to True in a future release. + You can set ``keep_tz=True`` to already obtain the future + behaviour and silence the warning. + + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series + """ + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + if keep_tz is None and self.tz is not None: + warnings.warn("The default of the 'keep_tz' keyword will change " + "to True in a future release. You can set " + "'keep_tz=True' to obtain the future behaviour and " + "silence this warning.", FutureWarning, stacklevel=2) + keep_tz = False + elif keep_tz is False: + warnings.warn("Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", FutureWarning, stacklevel=2) + + if keep_tz and self.tz is not None: + # preserve the tz & copy + values = self.copy(deep=True) + else: + values = self.values.copy() + + return Series(values, index=index, name=name) + + def snap(self, freq='S'): + """ + Snap time stamps to nearest occurring frequency + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.onOffset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + # TODO: what about self.name? if so, use shallow_copy? 
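The relocated `astype` and `snap` appear to be pure moves; a short sketch of the two less obvious behaviors, assuming current semantics (a tz-aware `datetime64` dtype dispatches to `tz_localize`/`tz_convert` per GH 18951, and `snap` rounds to the nearest on-offset timestamp):

import pandas as pd

# naive -> tz-aware dtype localizes; aware -> aware converts
naive = pd.date_range('2000-01-01', periods=2)
naive.astype('datetime64[ns, UTC]')                            # via tz_localize
naive.tz_localize('UTC').astype('datetime64[ns, US/Eastern]')  # via tz_convert

# snap keeps whichever of rollback/rollforward is closer
dti = pd.DatetimeIndex(['2018-01-01 01:20', '2018-01-01 01:40'])
dti.snap(freq='H')
# DatetimeIndex(['2018-01-01 01:00:00', '2018-01-01 02:00:00'], ..., freq='H')
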
+ + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + # TODO(DatetimeArray): change dispatch once inheritance is removed + # call DatetimeArray method + result = DatetimeArray.unique(self) + return self._shallow_copy(result._data) + + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + """ + See Index.join + """ + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type not in ('floating', 'integer', 'mixed-integer', + 'mixed-integer-float', 'mixed')): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers, sort=sort) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + elif other.tz is not None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if (isinstance(other, DatetimeIndex) and + self.freq == other.freq and + self._can_fast_union(other)): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + tz = getattr(other, 'tz', None) + return self._simple_new(joined, name, tz=tz) + def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1ebcf213ab0eb..5ee6a816d91f5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -137,6 +137,9 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -168,6 +171,50 @@ def _simple_new(cls, array, name, closed=None): result._reset_identity() return result + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) + def from_arrays(cls, left, right, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, + dtype=None): + msg = ('IntervalIndex.from_intervals is deprecated and will be ' + 'removed in a future version; Use IntervalIndex(...) 
instead') + warnings.warn(msg, FutureWarning, stacklevel=2) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) + def from_tuples(cls, data, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) @@ -231,48 +278,6 @@ def contains(self, key): except KeyError: return False - @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) 
instead') - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(arr, name=name) - @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", examples=""" @@ -941,6 +946,8 @@ def __getitem__(self, value): # scalar return result + # -------------------------------------------------------------------- + # Rendering Methods # __repr__ associated methods are based on MultiIndex def _format_with_header(self, header, **kwargs): @@ -997,6 +1004,8 @@ def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) + # -------------------------------------------------------------------- + def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea6dfa6a3a6af..f03376c32f7f4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -200,6 +200,9 @@ class MultiIndex(Index): _comparables = ['names'] rename = Index.set_names + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, levels=None, labels=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True, _set_identity=True): @@ -275,6 +278,154 @@ def _verify_integrity(self, labels=None, levels=None): values=[value for value in level], level=i)) + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. 
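With the IntervalIndex classmethods now grouped under the Constructors banner, a compact sketch of how they relate; all four calls below build the same three right-closed intervals, and `from_intervals` is the deprecated spelling of the last form:

import pandas as pd

pd.IntervalIndex.from_breaks([0, 1, 2, 3])             # contiguous: (0,1], (1,2], (2,3]
pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])     # same edges, given explicitly
pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3)])
pd.IntervalIndex([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)])
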
+ """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + + from pandas.core.arrays.categorical import _factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) + if names is None: + names = [getattr(arr, "name", None) for arr in arrays] + + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names, verify_integrity=False) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + (2, u'red'), (2, u'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + + if len(tuples) == 0: + if names is None: + msg = 'Cannot infer number of levels from empty list' + raise TypeError(msg) + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = tuples._values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = lzip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of strings or None + Names for the levels in the index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = [u'green', u'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
+ """ + from pandas.core.arrays.categorical import _factorize_from_iterables + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + @property def levels(self): return self._levels @@ -622,6 +773,9 @@ def _nbytes(self, deep=False): result += self._engine.sizeof(deep=deep) return result + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) @@ -644,6 +798,94 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_native_types(self, na_rep='nan', **kwargs): + new_levels = [] + new_labels = [] + + # go through the levels and format them + for level, label in zip(self.levels, self.labels): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = (label == -1) + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + label = label.values() + label[mask] = nan_index + new_levels.append(level) + new_labels.append(label) + + if len(new_levels) == 1: + return Index(new_levels[0])._format_native_types() + else: + # reconstruct the multi-index + mi = MultiIndex(levels=new_levels, labels=new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) + return mi.values + + def format(self, space=2, sparsify=None, adjoin=True, names=False, + na_rep=None, formatter=None): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, lab in zip(self.levels, self.labels): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(lab).format(formatter=formatter) + + # we have some NA + mask = lab == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [pprint_thing(na if isna(x) else x, + escape_chars=('\t', '\r', '\n')) + for x in algos.take_1d(lev._values, lab)] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append(pprint_thing(name, + escape_chars=('\t', '\r', '\n')) + if name is not None else '') + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = '' + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthey" value + if sparsify not in [True, 1]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, start=int(names), + sentinel=sentinel) + + if adjoin: + from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + # -------------------------------------------------------------------- + def __len__(self): return len(self.labels[0]) @@ -705,32 +947,6 @@ def _set_names(self, names, level=None, validate=True): names = property(fset=_set_names, 
fget=_get_names, doc="Names of levels in MultiIndex") - def _format_native_types(self, na_rep='nan', **kwargs): - new_levels = [] - new_labels = [] - - # go through the levels and format them - for level, label in zip(self.levels, self.labels): - level = level._format_native_types(na_rep=na_rep, **kwargs) - # add nan values, if there are any - mask = (label == -1) - if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) - label = label.values() - label[mask] = nan_index - new_levels.append(level) - new_labels.append(label) - - if len(new_levels) == 1: - return Index(new_levels[0])._format_native_types() - else: - # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, labels=new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) - return mi.values - @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): indexer = self.labels[level] @@ -1081,66 +1297,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, lab in zip(self.levels, self.labels): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) - - if len(lev) > 0: - - formatted = lev.take(lab).format(formatter=formatter) - - # we have some NA - mask = lab == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() - - else: - # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, lab)] - stringified_levels.append(formatted) - - result_levels = [] - for lev, name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') - - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel = '' - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value - if sparsify not in [True, 1]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) - - if adjoin: - from pandas.io.formats.format import _get_adjustment - adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') - else: - return result_levels - def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) @@ -1289,152 +1445,6 @@ def lexsort_depth(self): return 0 - @classmethod - def from_arrays(cls, arrays, sortorder=None, names=None): - """ - Convert arrays to MultiIndex - - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
- MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - """ - if not is_list_like(arrays): - raise TypeError("Input must be a list / sequence of array-likes.") - elif is_iterator(arrays): - arrays = list(arrays) - - # Check if lengths of all arrays are equal or not, - # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') - - from pandas.core.arrays.categorical import _factorize_from_iterables - - labels, levels = _factorize_from_iterables(arrays) - if names is None: - names = [getattr(arr, "name", None) for arr in arrays] - - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names, verify_integrity=False) - - @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): - """ - Convert list of tuples to MultiIndex - - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables - """ - if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') - elif is_iterator(tuples): - tuples = list(tuples) - - if len(tuples) == 0: - if names is None: - msg = 'Cannot infer number of levels from empty list' - raise TypeError(msg) - arrays = [[]] * len(names) - elif isinstance(tuples, (np.ndarray, Index)): - if isinstance(tuples, Index): - tuples = tuples._values - - arrays = list(lib.tuples_to_object_array(tuples).T) - elif isinstance(tuples, list): - arrays = list(lib.to_object_array_tuples(tuples).T) - else: - arrays = lzip(*tuples) - - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) - - @classmethod - def from_product(cls, iterables, sortorder=None, names=None): - """ - Make a MultiIndex from the cartesian product of multiple iterables - - Parameters - ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of strings or None - Names for the levels in the index. - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
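`format` and `_format_native_types` moved into the Rendering Methods block above; a brief sketch of the sparsify handling they implement, with outputs abridged:

import pandas as pd

mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['num', 'let'])
mi.format(names=True)        # header row first; repeated outer labels blanked
mi.format(sparsify=False)    # every label spelled out on every row
mi.format(sparsify='..')     # non-boolean sparsify becomes the fill sentinel (GH 3547)
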
- """ - from pandas.core.arrays.categorical import _factorize_from_iterables - from pandas.core.reshape.util import cartesian_product - - if not is_list_like(iterables): - raise TypeError("Input must be a list / sequence of iterables.") - elif is_iterator(iterables): - iterables = list(iterables) - - labels, levels = _factorize_from_iterables(iterables) - labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index fec3a9bd24cc8..56df454bddf1c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -364,12 +364,6 @@ def to_timestamp(self, freq=None, how='start'): name=self.name, freq=result.freq) - def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, - quoting=quoting, - **kwargs) - def _maybe_convert_timedelta(self, other): """ Convert timedelta-like input to an integer multiple of self.freq @@ -412,6 +406,19 @@ def _maybe_convert_timedelta(self, other): raise IncompatibleFrequency(msg.format(cls=type(self).__name__, freqstr=self.freqstr)) + # ------------------------------------------------------------------------ + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): + # just dispatch, return ndarray + return self._data._format_native_types(na_rep=na_rep, + quoting=quoting, + **kwargs) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.astype(object).values + # ------------------------------------------------------------------------ # Indexing @@ -595,10 +602,6 @@ def is_full(self): values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.astype(object).values - @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d6286244fcb7e..364aadb9523f0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -25,7 +25,6 @@ class RangeIndex(Int64Index): - """ Immutable Index implementing a monotonic integer range. 
@@ -64,6 +63,9 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): @@ -158,6 +160,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ @@ -188,6 +192,9 @@ def __reduce__(self): d.update(dict(self._get_data_as_items())) return ibase._new_Index, (self.__class__, d), None + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr, formatted_value) @@ -201,6 +208,8 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + # -------------------------------------------------------------------- + @cache_readonly def nbytes(self): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8f50b40a20738..9ceb49a60edd2 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -127,6 +127,9 @@ def _join_i8_wrapper(joinf, **kwargs): _freq = None + # ------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, periods=None, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -193,10 +196,7 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result._reset_identity() return result - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- def __setstate__(self, state): """Necessary for making this object picklable""" @@ -218,6 +218,14 @@ def _evaluate_with_timedelta_like(self, other, op): result = TimedeltaArray._evaluate_with_timedelta_like(self, other, op) return wrap_arithmetic_op(self, other, result) + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): from pandas.io.formats.format import Timedelta64Formatter return Timedelta64Formatter(values=self,