From 1a589cfdf1d2f79b4e86b4c204663620b4365b42 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 16:29:20 -0800 Subject: [PATCH 01/10] Collect rendering, name, level, copy, constructor, and introspection methods --- pandas/core/indexes/base.py | 1142 ++++++++++++++++++----------------- 1 file changed, 584 insertions(+), 558 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 22c348acaf341..2d113e4c82176 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -244,6 +244,9 @@ def _outer_indexer(self, left, right): str = CachedAccessor("str", StringMethods) + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None, tupleize_cols=True, **kwargs): @@ -518,6 +521,19 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + # Copying Methods + + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. + """ + return {k: getattr(self, k, None) for k in self._attributes} + _index_shared_docs['_shallow_copy'] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking @@ -604,45 +620,60 @@ def _deepcopy_if_needed(self, orig, copy=False): return self - def _update_inplace(self, result, **kwargs): - # guard when called from IndexOpsMixin - raise TypeError("Index can't be updated inplace") - - def _sort_levels_monotonic(self): - """ - Compat with MultiIndex. - """ - return self - - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level + _index_shared_docs['copy'] = """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. Parameters ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level + name : string, optional + deep : boolean, default False + dtype : numpy dtype or pandas type Returns ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level + copy : Index + + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self + @Appender(_index_shared_docs['copy']) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + if deep: + new_index = self._shallow_copy(self._data.copy()) else: - grouper = self.map(mapper) + new_index = self._shallow_copy() - return grouper, None, None + names = kwargs.get('names') + names = self._validate_names(name=name, names=names, deep=deep) + new_index = new_index.set_names(names) + + if dtype: + new_index = new_index.astype(dtype) + return new_index + + def __copy__(self, **kwargs): + return self.copy(**kwargs) + + def __deepcopy__(self, memo=None): + """ + Parameters + ---------- + memo, default None + Standard signature. 
Unused + """ + if memo is None: + memo = {} + return self.copy(deep=True) + + # -------------------------------------------------------------------- + + def _update_inplace(self, result, **kwargs): + # guard when called from IndexOpsMixin + raise TypeError("Index can't be updated inplace") def is_(self, other): """ @@ -671,6 +702,9 @@ def _reset_identity(self): self._id = _Identity() return self + # -------------------------------------------------------------------- + # Array-Like Methods + # ndarray compat def __len__(self): """ @@ -695,6 +729,71 @@ def __array_wrap__(self, result, context=None): attrs = self._maybe_update_attributes(attrs) return Index(result, **attrs) + def ravel(self, order='C'): + """ + Return an ndarray of the flattened values of the underlying data. + + See Also + -------- + numpy.ndarray.ravel + """ + return self._ndarray_values.ravel(order=order) + + def view(self, cls=None): + + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result + + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + .. versionadded:: 0.19.0 + """ + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self + + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) + + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + + # -------------------------------------------------------------------- + @cache_readonly def dtype(self): """ @@ -874,16 +973,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) - def ravel(self, order='C'): - """ - Return an ndarray of the flattened values of the underlying data. - - See Also - -------- - numpy.ndarray.ravel - """ - return self._ndarray_values.ravel(order=order) - # construction helpers @classmethod def _try_convert_to_int_index(cls, data, copy, name, dtype): @@ -963,24 +1052,6 @@ def _coerce_to_ndarray(cls, data): data = np.asarray(data) return data - def _get_attributes_dict(self): - """ - Return an attributes dict for my class. 
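A quick illustration of the `astype` behavior collected above; this is a hypothetical doctest session, not part of the patch, with reprs as of the 0.24 development line:

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3])
>>> idx.astype('float64')
Float64Index([1.0, 2.0, 3.0], dtype='float64')
>>> idx.astype('category')
CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')

Note that when the target dtype already matches, `astype` still returns a copy unless `copy=False` is passed.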
- """ - return {k: getattr(self, k, None) for k in self._attributes} - - def view(self, cls=None): - - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -998,73 +1069,8 @@ def _coerce_scalar_to_index(self, item): return Index([item], dtype=dtype, **self._get_attributes_dict()) - _index_shared_docs['copy'] = """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. - - Parameters - ---------- - name : string, optional - deep : boolean, default False - dtype : numpy dtype or pandas type - - Returns - ------- - copy : Index - - Notes - ----- - In most cases, there should be no functional difference from using - ``deep``, but if ``deep`` is passed it will attempt to deepcopy. - """ - - @Appender(_index_shared_docs['copy']) - def copy(self, name=None, deep=False, dtype=None, **kwargs): - if deep: - new_index = self._shallow_copy(self._data.copy()) - else: - new_index = self._shallow_copy() - - names = kwargs.get('names') - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) - - if dtype: - new_index = new_index.astype(dtype) - return new_index - - def __copy__(self, **kwargs): - return self.copy(**kwargs) - - def __deepcopy__(self, memo=None): - """ - Parameters - ---------- - memo, default None - Standard signature. Unused - """ - if memo is None: - memo = {} - return self.copy(deep=True) - - def _validate_names(self, name=None, names=None, deep=False): - """ - Handles the quirks of having a singular 'name' parameter for general - Index and plural 'names' parameter for MultiIndex. - """ - from copy import deepcopy - if names is not None and name is not None: - raise TypeError("Can only provide one of `names` and `name`") - elif names is None and name is None: - return deepcopy(self.names) if deep else self.names - elif names is not None: - if not is_list_like(names): - raise TypeError("Must pass list-like as `names`.") - return names - else: - if not is_list_like(name): - return [name] - return name + # -------------------------------------------------------------------- + # Rendering Methods def __unicode__(self): """ @@ -1125,191 +1131,111 @@ def _format_attrs(self): """ return format_object_attrs(self) - def to_flat_index(self): - """ - Identity method. - - .. versionadded:: 0.24.0 - - This is implemented for compatability with subclass implementations - when chaining. + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values - Returns - ------- - pd.Index - Caller. - - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self - - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index - - Returns - ------- - Series : dtype will be based on the type of the Index values. 
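The rendering helpers grouped in this hunk (`__unicode__`, `_format_attrs`, and the `format`/`_format_with_header` pair) all funnel into the same list-of-strings representation. A minimal sketch of the public entry point, as a hypothetical session:

>>> import pandas as pd
>>> idx = pd.Index(['a', 'bb', 'ccc'], name='x')
>>> idx.format(name=True)
['x', 'a', 'bb', 'ccc']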
+ def format(self, name=False, formatter=None, **kwargs): """ - - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - return Series(self.values.copy(), index=index, name=name) - - def to_frame(self, index=True, name=None): + Render a string representation of the Index. """ - Create a DataFrame with a column containing the Index. + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') - .. versionadded:: 0.24.0 + if formatter is not None: + return header + list(self.map(formatter)) - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index. + return self._format_with_header(header, **kwargs) - name : object, default None - The passed name should substitute for the index name (if it has - one). + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values - Returns - ------- - DataFrame - DataFrame containing the original Index data. + from pandas.io.formats.format import format_array - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. + if is_categorical_dtype(values.dtype): + values = np.array(values) - Examples - -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) - By default, the original Index is reused. To enforce a new Index: + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() - To override the name of the resulting column, specify `name`: + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow + def to_native_types(self, slicer=None, **kwargs): """ - - from pandas import DataFrame - if name is None: - name = self.name or 0 - result = DataFrame({name: self.values.copy()}) - - if index: - result.index = self - return result - - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. + Format specified values of `self` and return them. Parameters ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: - .. 
versionadded:: 0.19.0 + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values """ - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self - - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) - - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) - def _to_safe_for_reshape(self): + def _format_native_types(self, na_rep='', quoting=None, **kwargs): """ - Convert to object if we are a categorical. + Actually format specific types of the index. """ - return self - - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True - - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) else: - result_name = get_op_result_name(self, other) - return other, result_name + values = np.array(self, dtype=object, copy=True) - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. - """ - return value + values[mask] = na_rep + return values - def _assert_can_do_op(self, value): + # -------------------------------------------------------------------- + # Name-Related Methods + + def _validate_names(self, name=None, names=None, deep=False): """ - Check value is valid for scalar op. + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - - @property - def nlevels(self): - return 1 + from copy import deepcopy + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names + else: + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name, )) @@ -1468,60 +1394,193 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + # -------------------------------------------------------------------- + # Level-Related Methods + @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False + def nlevels(self): + return 1 - def _summary(self, name=None): + def _sort_levels_monotonic(self): """ - Return a summarized representation. 
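Everything in the name-handling block above funnels through `set_names`; `rename` is a thin wrapper for the single-name case. A hypothetical session illustrating both (reprs from a 0.24-era build):

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3], name='foo')
>>> idx.rename('bar')
Int64Index([1, 2, 3], dtype='int64', name='bar')
>>> idx.set_names(['baz'])
Int64Index([1, 2, 3], dtype='int64', name='baz')

Both return new objects unless `inplace=True` is passed.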
+ Compat with MultiIndex. + """ + return self + + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level Parameters ---------- - name : str - name to use in the summary representation + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level Returns ------- - String with a summarized representation of the index + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level """ - if len(self) > 0: - head = self[0] - if (hasattr(head, 'format') and - not isinstance(head, compat.string_types)): - head = head.format() - tail = self[-1] - if (hasattr(tail, 'format') and - not isinstance(tail, compat.string_types)): - tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) + + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self else: - index_summary = '' + grouper = self.map(mapper) - if name is None: - name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + return grouper, None, None - def summary(self, name=None): + def _validate_index_level(self, level): """ - Return a summarized representation. + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. - .. deprecated:: 0.23.0 """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) - return self._summary(name) + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 - _na_value = np.nan - """The expected NA value to use with this index.""" + def sortlevel(self, level=None, ascending=True, sort_remaining=None): + """ + For internal compatibility with with the Index API. + + Sort the Index. This is for compat with MultiIndex + + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + sorted_index : Index + """ + return self.sort_values(return_indexer=True, ascending=ascending) + + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. 
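For a flat Index, the level-compat methods collected here are mostly no-ops; `sortlevel`, for instance, ignores its `level` argument and defers to `sort_values(return_indexer=True)`. A hypothetical session:

>>> import pandas as pd
>>> pd.Index([3, 1, 2]).sortlevel()
(Int64Index([1, 2, 3], dtype='int64'), array([1, 2, 0]))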
+ + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. + + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + # -------------------------------------------------------------------- + # Introspection Methods - # introspection @property def is_monotonic(self): """ @@ -1650,26 +1709,218 @@ def is_categorical(self): >>> idx.is_categorical() False - >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) - >>> s - 0 Peter - 1 Victor - 2 Elisabeth - 3 Mar - dtype: object - >>> s.index.is_categorical() - False - """ - return self.inferred_type in ['categorical'] + >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() + False + """ + return self.inferred_type in ['categorical'] + + def is_interval(self): + return self.inferred_type in ['interval'] + + def is_mixed(self): + return self.inferred_type in ['mixed'] + + def holds_integer(self): + return self.inferred_type in ['integer', 'mixed-integer'] + + # -------------------------------------------------------------------- + + def to_flat_index(self): + """ + Identity method. + + .. versionadded:: 0.24.0 + + This is implemented for compatability with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. 
If None, defaults to name of original + index + + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ + + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). + + Returns + ------- + DataFrame + DataFrame containing the original Index data. + + See Also + -------- + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. + + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow + + To override the name of the resulting column, specify `name`: + + >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow + """ + + from pandas import DataFrame + if name is None: + name = self.name or 0 + result = DataFrame({name: self.values.copy()}) + + if index: + result.index = self + return result + + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self + + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True + + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name + + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. + """ + return value + + def _assert_can_do_op(self, value): + """ + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False + + def _summary(self, name=None): + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if (hasattr(head, 'format') and + not isinstance(head, compat.string_types)): + head = head.format() + tail = self[-1] + if (hasattr(tail, 'format') and + not isinstance(tail, compat.string_types)): + tail = tail.format() + index_summary = ', %s to %s' % (pprint_thing(head), + pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) - def is_interval(self): - return self.inferred_type in ['interval'] + def summary(self, name=None): + """ + Return a summarized representation. - def is_mixed(self): - return self.inferred_type in ['mixed'] + .. 
deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) - def holds_integer(self): - return self.inferred_type in ['integer', 'mixed-integer'] + _na_value = np.nan + """The expected NA value to use with this index.""" _index_shared_docs['_convert_scalar_indexer'] = """ Convert a scalar indexer. @@ -1962,39 +2213,11 @@ def get_duplicates(self): def _cleanup(self): self._engine.clear_mapping() - @cache_readonly - def _constructor(self): - return type(self) - @cache_readonly def _engine(self): # property, for now, slow to look up return self._engine_type(lambda: self._ndarray_values, len(self)) - def _validate_index_level(self, level): - """ - Validate index level. - - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. - - """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) - - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 - @cache_readonly def inferred_type(self): """ @@ -2411,86 +2634,6 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def format(self, name=False, formatter=None, **kwargs): - """ - Render a string representation of the Index. - """ - header = [] - if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') - - if formatter is not None: - return header + list(self.map(formatter)) - - return self._format_with_header(header, **kwargs) - - def _format_with_header(self, header, na_rep='NaN', **kwargs): - values = self.values - - from pandas.io.formats.format import format_array - - if is_categorical_dtype(values.dtype): - values = np.array(values) - - elif is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) - - if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] - - # could have nans - mask = isna(values) - if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() - - else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result - - def to_native_types(self, slicer=None, **kwargs): - """ - Format specified values of `self` and return them. - - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: - - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ - - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) - - def _format_native_types(self, na_rep='', quoting=None, **kwargs): - """ - Actually format specific types of the index. 
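`to_native_types` is the public wrapper around the `_format_native_types` hook moved in this hunk; a sketch of the NA replacement it performs (hypothetical session):

>>> import pandas as pd
>>> idx = pd.Index(['a', None, 'c'])
>>> idx.to_native_types(na_rep='NULL')
array(['a', 'NULL', 'c'], dtype=object)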
- """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values - def equals(self, other): """ Determines if two Index objects contain the same elements. @@ -2683,25 +2826,6 @@ def sort(self, *args, **kwargs): raise TypeError("cannot sort an Index object in-place, use " "sort_values instead") - def sortlevel(self, level=None, ascending=True, sort_remaining=None): - """ - For internal compatibility with with the Index API. - - Sort the Index. This is for compat with MultiIndex - - Parameters - ---------- - ascending : boolean, default True - False to sort in descending order - - level, sort_remaining are compat parameters - - Returns - ------- - sorted_index : Index - """ - return self.sort_values(return_indexer=True, ascending=ascending) - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -3275,104 +3399,6 @@ def set_value(self, arr, key, value): self._engine.set_value(com.values_from_object(arr), com.values_from_object(key), value) - def _get_level_values(self, level): - """ - Return an Index of values for requested level. - - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. - - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. - - See Also - -------- - MultiIndex.get_level_values : Get values for a level of a MultiIndex. - - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') - - Get level values by supplying `level` as integer: - - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self - - get_level_values = _get_level_values - - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. - - If resulting index has only 1 level left, the result will be - of Index type, not MultiIndex. - - .. versionadded:: 0.23.1 (support for non-MultiIndex) - - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. 
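Two of the methods touched by this hunk are easy to pin down with doctests: `equals` compares elements and ignores dtype, while `shift` only works on offset-capable indexes such as DatetimeIndex (hypothetical session):

>>> import pandas as pd
>>> pd.Index([1, 2, 3]).equals(pd.Index([1.0, 2.0, 3.0]))
True
>>> pd.date_range('2018-01-01', periods=3, freq='D').shift(1)
DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04'], dtype='datetime64[ns]', freq='D')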
- - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] - - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex - - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) - - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) - - if len(new_levels) == 1: - - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - - result.name = new_names[0] - return result - else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the From c08e631a600a17b57ed46bf0084f0c08549d6fd1 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 16:39:42 -0800 Subject: [PATCH 02/10] Collect Index methdos --- pandas/core/indexes/base.py | 251 +++++++++++++++++++----------------- 1 file changed, 134 insertions(+), 117 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2d113e4c82176..a4a47b47e8e6a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -670,6 +670,15 @@ def __deepcopy__(self, memo=None): return self.copy(deep=True) # -------------------------------------------------------------------- + # Engine/Identity Methods + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + + def _cleanup(self): + self._engine.clear_mapping() def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin @@ -792,8 +801,6 @@ def astype(self, dtype, copy=True): msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - # -------------------------------------------------------------------- - @cache_readonly def dtype(self): """ @@ -808,6 +815,68 @@ def dtype_str(self): """ return str(self.dtype) + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. + + For internal compatibility with numpy arrays. + + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. 
If Index doesn't hold NA, raise ValueError + + See Also + -------- + numpy.ndarray.take + """ + + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) + + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken + + # -------------------------------------------------------------------- + @property def values(self): """ @@ -1730,7 +1799,21 @@ def is_mixed(self): def holds_integer(self): return self.inferred_type in ['integer', 'mixed-integer'] + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self) + + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) + # -------------------------------------------------------------------- + # Conversion Methods def to_flat_index(self): """ @@ -1840,6 +1923,45 @@ def to_frame(self, index=True, name=None): result.index = self return result + # -------------------------------------------------------------------- + # Pickle Methods + + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None + + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ + + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) + + elif isinstance(state, tuple): + + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") + + _unpickle_compat = __setstate__ + + # -------------------------------------------------------------------- + def _to_safe_for_reshape(self): """ Convert to object if we are a categorical. @@ -2210,21 +2332,6 @@ def get_duplicates(self): return self[self.duplicated()].unique() - def _cleanup(self): - self._engine.clear_mapping() - - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) - - @cache_readonly - def inferred_type(self): - """ - Return a string of the type inferred from the values. 
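A sketch of the fill-on-take behavior implemented above, together with `inferred_type`, whose definition follows (hypothetical session):

>>> import numpy as np
>>> import pandas as pd
>>> idx = pd.Index([1.0, 2.0, 3.0])
>>> idx.take([1, -1], allow_fill=True, fill_value=np.nan)
Float64Index([2.0, nan], dtype='float64')
>>> idx.inferred_type
'floating'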
- """ - return lib.infer_dtype(self) - def _is_memory_usage_qualified(self): """ Return a boolean if we need a qualified .info display. @@ -2234,46 +2341,6 @@ def _is_memory_usage_qualified(self): def is_type_compatible(self, kind): return kind == self.inferred_type - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) - - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None - - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ - - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - def __nonzero__(self): raise ValueError("The truth value of a {0} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." @@ -2419,66 +2486,6 @@ def _concat_same_dtype(self, to_concat, name): # must be overridden in specific classes return _concat._concat_index_asobject(to_concat, name) - _index_shared_docs['take'] = """ - Return a new %(klass)s of the values selected by the indices. - - For internal compatibility with numpy arrays. - - Parameters - ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError - - See Also - -------- - numpy.ndarray.take - """ - - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) - - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): - """ - Internal method to handle NA filling of take. 
- """ - indices = ensure_platform_int(indices) - - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken - @cache_readonly def _isnan(self): """ @@ -2952,6 +2959,9 @@ def __or__(self, other): def __xor__(self, other): return self.symmetric_difference(other) + # -------------------------------------------------------------------- + # Set Operation Methods + def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, @@ -3241,6 +3251,8 @@ def symmetric_difference(self, other, result_name=None): attribs['freq'] = None return self._shallow_copy_with_infer(the_diff, **attribs) + # -------------------------------------------------------------------- + def _get_unique_index(self, dropna=False): """ Returns an index containing unique values. @@ -3923,6 +3935,9 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer + # -------------------------------------------------------------------- + # Join Methods + _index_shared_docs['join'] = """ Compute join_index and indexers to conform data structures to the new index. @@ -4315,6 +4330,8 @@ def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return Index(joined, name=name) + # -------------------------------------------------------------------- + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex From 888d4cca625775f2962fe19d72bdc5212e65829e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 16:48:17 -0800 Subject: [PATCH 03/10] collect more --- pandas/core/indexes/base.py | 695 ++++++++++++++++++------------------ 1 file changed, 351 insertions(+), 344 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a4a47b47e8e6a..fbb78dd5a7343 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -525,6 +525,87 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): def _constructor(self): return type(self) + # -------------------------------------------------------------------- + # Construction helpers + + @classmethod + def _try_convert_to_int_index(cls, data, copy, name, dtype): + """ + Attempt to convert an array of data into an integer index. + + Parameters + ---------- + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. + + Returns + ------- + int_index : data converted to either an Int64Index or a + UInt64Index + + Raises + ------ + ValueError if the conversion was not successful. + """ + + from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desrired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. 
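+        # Illustration of the two-step fallback (hypothetical session,
+        # assuming numpy as np and pandas as pd; reprs from a 0.24-era
+        # development build):
+        #   >>> pd.Index(np.array([1, 2, 3], dtype=object))
+        #   Int64Index([1, 2, 3], dtype='int64')
+        #   >>> pd.Index(np.array([1, 2, 2**63], dtype=object))
+        #   UInt64Index([1, 2, 9223372036854775808], dtype='uint64')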
+ try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError + + @classmethod + def _scalar_data_error(cls, data): + raise TypeError('{0}(...) must be called with a collection of some ' + 'kind, {1} was passed'.format(cls.__name__, + repr(data))) + + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') + + @classmethod + def _coerce_to_ndarray(cls, data): + """ + Coerces data to ndarray. + + Converts other iterables to list first and then to array. + Does not touch ndarrays. + + Raises + ------ + TypeError + When the data passed in is a scalar. + """ + + if not isinstance(data, (np.ndarray, Index)): + if data is None or is_scalar(data): + cls._scalar_data_error(data) + + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data + # -------------------------------------------------------------------- # Copying Methods @@ -669,6 +750,55 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) + # -------------------------------------------------------------------- + # PandasObject/IndexOpsMixin Compat + + @property + def values(self): + """ + Return the underlying data as an ndarray. + """ + return self._data.view(np.ndarray) + + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + + See Also + -------- + values + _ndarray_values + """ + return self.values + + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result + # -------------------------------------------------------------------- # Engine/Identity Methods @@ -680,10 +810,6 @@ def _engine(self): def _cleanup(self): self._engine.clear_mapping() - def _update_inplace(self, result, **kwargs): - # guard when called from IndexOpsMixin - raise TypeError("Index can't be updated inplace") - def is_(self, other): """ More flexible, faster check like ``is`` but that works through views. @@ -877,44 +1003,6 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, # -------------------------------------------------------------------- - @property - def values(self): - """ - Return the underlying data as an ndarray. 
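The `.values` versus `._ndarray_values` distinction drawn in the table below is easiest to see on a PeriodIndex, where the former holds `Period` objects and the latter their integer ordinals (`_ndarray_values` is internal API; hypothetical session):

>>> import pandas as pd
>>> idx = pd.PeriodIndex(['2018-01'], freq='M')
>>> idx.values
array([Period('2018-01', 'M')], dtype=object)
>>> idx._ndarray_values
array([576])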
- """ - return self._data.view(np.ndarray) - - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index, np.ndarray] - # TODO(EA): remove index types as they become extension arrays - """ - The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. - - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. - - It may differ from the public '.values' method. - - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | - - See Also - -------- - values - _ndarray_values - """ - return self.values - def get_values(self): """ Return `Index` data as an `numpy.ndarray`. @@ -959,13 +1047,9 @@ def get_values(self): """ return self.values - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) - - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result + def _update_inplace(self, result, **kwargs): + # guard when called from IndexOpsMixin + raise TypeError("Index can't be updated inplace") # ops compat def repeat(self, repeats, *args, **kwargs): @@ -1042,85 +1126,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) - # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. 
- """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -1926,39 +1931,232 @@ def to_frame(self, index=True, name=None): # -------------------------------------------------------------------- # Pickle Methods - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None + + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ + + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) + + elif isinstance(state, tuple): + + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") + + _unpickle_compat = __setstate__ + + # -------------------------------------------------------------------- + # Null-Handling Methods + _na_value = np.nan + """The expected NA value to use with this index.""" + + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. + """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values + + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) + + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False + + def isna(self): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA + + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. + + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) + + Empty strings are not considered NA values. None is considered an NA + value. 
+ + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) + """ + return self._isnan + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + notnull = notna + + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) - def __setstate__(self, state): - """ - Necessary for making this object picklable. + Returns + ------- + filled : %(klass)s """ - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() - elif isinstance(state, tuple): + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. 
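The null-handling block gathered above pairs the `_isnan`/`hasnans` caches with the public `fillna`/`dropna`; a compact sketch (hypothetical session):

>>> import numpy as np
>>> import pandas as pd
>>> idx = pd.Index([1.0, np.nan, 3.0])
>>> idx.fillna(0)
Float64Index([1.0, 0.0, 3.0], dtype='float64')
>>> idx.dropna()
Float64Index([1.0, 3.0], dtype='float64')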
- else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) + Returns + ------- + valid : Index + """ - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) - _unpickle_compat = __setstate__ + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() # -------------------------------------------------------------------- @@ -2041,9 +2239,6 @@ def summary(self, name=None): "future version.", FutureWarning, stacklevel=2) return self._summary(name) - _na_value = np.nan - """The expected NA value to use with this index.""" - _index_shared_docs['_convert_scalar_indexer'] = """ Convert a scalar indexer. @@ -2486,142 +2681,6 @@ def _concat_same_dtype(self, to_concat, name): # must be overridden in specific classes return _concat._concat_index_asobject(to_concat, name) - @cache_readonly - def _isnan(self): - """ - Return if each value is NaN. - """ - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values - - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) - - @cache_readonly - def hasnans(self): - """ - Return if I have any nans; enables various perf speedups. - """ - if self._can_hold_na: - return bool(self._isnan.any()) - else: - return False - - def isna(self): - """ - Detect missing values. - - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - - .. versionadded:: 0.20.0 - - Returns - ------- - numpy.ndarray - A boolean array of whether my values are NA - - See Also - -------- - pandas.Index.notna : Boolean inverse of isna. - pandas.Index.dropna : Omit entries with missing values. - pandas.isna : Top-level isna. - Series.isna : Detect missing values in Series object. - - Examples - -------- - Show which entries in a pandas.Index are NA. The result is an - array. - - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) - - Empty strings are not considered NA values. None is considered an NA - value. - - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) - - For datetimes, `NaT` (Not a Time) is considered as an NA value. - - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) - >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna - - def notna(self): - """ - Detect existing (non-missing) values. - - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. 
Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. - - .. versionadded:: 0.20.0 - - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. - - See Also - -------- - Index.notnull : Alias of notna. - Index.isna: Inverse of notna. - pandas.notna : Top-level notna. - - Examples - -------- - Show which entries in an Index are not NA. The result is an - array. - - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.notna() - array([ True, True, False]) - - Empty strings are not considered NA values. None is considered a NA - value. - - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.notna() - array([ True, True, True, False]) - """ - return ~self.isna() - notnull = notna - def putmask(self, mask, value): """ Return a new Index of the values set with the mask. @@ -4806,58 +4865,6 @@ def duplicated(self, keep='first'): """ return super(Index, self).duplicated(keep=keep) - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value - - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) - - Returns - ------- - filled : %(klass)s - """ - - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values - - Parameters - ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. 
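putmask, which fillna above delegates to, returns a new Index rather than mutating in place; a minimal sketch alongside duplicated:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'a'])
    idx.putmask([True, False, False], 'z')   # Index(['z', 'b', 'a'], dtype='object')
    idx.duplicated()                         # array([False, False,  True])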
- - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() - def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results From 8c26d6f62b8def4fbe12534fc2cd96f0ff7b6426 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 16:50:51 -0800 Subject: [PATCH 04/10] revert --- pandas/core/indexes/base.py | 2450 +++++++++++++++++------------------ 1 file changed, 1200 insertions(+), 1250 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fbb78dd5a7343..22c348acaf341 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -244,9 +244,6 @@ def _outer_indexer(self, left, right): str = CachedAccessor("str", StringMethods) - # -------------------------------------------------------------------- - # Constructors - def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None, tupleize_cols=True, **kwargs): @@ -521,100 +518,6 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() - @cache_readonly - def _constructor(self): - return type(self) - - # -------------------------------------------------------------------- - # Construction helpers - - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. 
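The scalar guard in ``_coerce_to_ndarray`` (removed here and re-added further down in this patch) is what produces the familiar constructor error; for example:

    import pandas as pd

    pd.Index(5)              # TypeError: Index(...) must be called with a
                             # collection of some kind, 5 was passed
    pd.Index(iter([1, 2]))   # other iterables are converted to a list first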
- """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - - # -------------------------------------------------------------------- - # Copying Methods - - def _get_attributes_dict(self): - """ - Return an attributes dict for my class. - """ - return {k: getattr(self, k, None) for k in self._attributes} - _index_shared_docs['_shallow_copy'] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking @@ -701,114 +604,45 @@ def _deepcopy_if_needed(self, orig, copy=False): return self - _index_shared_docs['copy'] = """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. - - Parameters - ---------- - name : string, optional - deep : boolean, default False - dtype : numpy dtype or pandas type - - Returns - ------- - copy : Index + def _update_inplace(self, result, **kwargs): + # guard when called from IndexOpsMixin + raise TypeError("Index can't be updated inplace") - Notes - ----- - In most cases, there should be no functional difference from using - ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + def _sort_levels_monotonic(self): """ + Compat with MultiIndex. + """ + return self - @Appender(_index_shared_docs['copy']) - def copy(self, name=None, deep=False, dtype=None, **kwargs): - if deep: - new_index = self._shallow_copy(self._data.copy()) - else: - new_index = self._shallow_copy() - - names = kwargs.get('names') - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) - - if dtype: - new_index = new_index.astype(dtype) - return new_index - - def __copy__(self, **kwargs): - return self.copy(**kwargs) + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level - def __deepcopy__(self, memo=None): - """ Parameters ---------- - memo, default None - Standard signature. Unused - """ - if memo is None: - memo = {} - return self.copy(deep=True) - - # -------------------------------------------------------------------- - # PandasObject/IndexOpsMixin Compat - - @property - def values(self): - """ - Return the underlying data as an ndarray. - """ - return self._data.view(np.ndarray) - - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index, np.ndarray] - # TODO(EA): remove index types as they become extension arrays - """ - The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. - - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. - - It may differ from the public '.values' method. 
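The copy machinery being moved here plugs straight into the stdlib protocol, and ``.values`` stays a plain ndarray for a base Index; a sketch (illustrative data):

    import copy
    import numpy as np
    import pandas as pd

    idx = pd.Index([1, 2, 3], name='n')
    dup = copy.deepcopy(idx)             # routes through __deepcopy__ -> copy(deep=True)
    dup is idx, dup.equals(idx)          # (False, True); the name 'n' is carried over
    isinstance(idx.values, np.ndarray)   # True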
- - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level - See Also - -------- - values - _ndarray_values + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level """ - return self.values - - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) - - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result - - # -------------------------------------------------------------------- - # Engine/Identity Methods - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) - def _cleanup(self): - self._engine.clear_mapping() + return grouper, None, None def is_(self, other): """ @@ -837,9 +671,6 @@ def _reset_identity(self): self._id = _Identity() return self - # -------------------------------------------------------------------- - # Array-Like Methods - # ndarray compat def __len__(self): """ @@ -864,144 +695,57 @@ def __array_wrap__(self, result, context=None): attrs = self._maybe_update_attributes(attrs) return Index(result, **attrs) - def ravel(self, order='C'): + @cache_readonly + def dtype(self): """ - Return an ndarray of the flattened values of the underlying data. + Return the dtype object of the underlying data. + """ + return self._data.dtype - See Also - -------- - numpy.ndarray.ravel + @cache_readonly + def dtype_str(self): """ - return self._ndarray_values.ravel(order=order) + Return the dtype str of the underlying data. + """ + return str(self.dtype) - def view(self, cls=None): + @property + def values(self): + """ + Return the underlying data as an ndarray. + """ + return self._data.view(np.ndarray) - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. 
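astype, whose docstring is being relocated above, dispatches on the target dtype; a minimal sketch:

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    idx.astype('float64')    # Float64Index([1.0, 2.0, 3.0], dtype='float64')
    idx.astype('category')   # CategoricalIndex([1, 2, 3], categories=[1, 2, 3],
                             #                  ordered=False, dtype='category')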
- If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - .. versionadded:: 0.19.0 - """ - - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self - - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) - - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - - @cache_readonly - def dtype(self): - """ - Return the dtype object of the underlying data. - """ - return self._data.dtype - - @cache_readonly - def dtype_str(self): - """ - Return the dtype str of the underlying data. - """ - return str(self.dtype) + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. - _index_shared_docs['take'] = """ - Return a new %(klass)s of the values selected by the indices. + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. - For internal compatibility with numpy arrays. + It may differ from the public '.values' method. - Parameters - ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | See Also -------- - numpy.ndarray.take - """ - - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) - - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): - """ - Internal method to handle NA filling of take. 
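take, per the docstring above, reorders by position; ``-1`` injects NA only when ``allow_fill=True`` and a ``fill_value`` is supplied:

    import numpy as np
    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'])
    idx.take([2, 0])                    # Index(['c', 'a'], dtype='object')
    idx.take([0, -1], allow_fill=True,
             fill_value=np.nan)         # Index(['a', nan], dtype='object')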
+ values + _ndarray_values """ - indices = ensure_platform_int(indices) - - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken - - # -------------------------------------------------------------------- + return self.values def get_values(self): """ @@ -1047,9 +791,13 @@ def get_values(self): """ return self.values - def _update_inplace(self, result, **kwargs): - # guard when called from IndexOpsMixin - raise TypeError("Index can't be updated inplace") + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) + + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result # ops compat def repeat(self, repeats, *args, **kwargs): @@ -1126,171 +874,178 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) - def _coerce_scalar_to_index(self, item): + def ravel(self, order='C'): """ - We need to coerce a scalar to a compat for our index type. + Return an ndarray of the flattened values of the underlying data. - Parameters - ---------- - item : scalar item to coerce + See Also + -------- + numpy.ndarray.ravel """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) - - # -------------------------------------------------------------------- - # Rendering Methods + return self._ndarray_values.ravel(order=order) - def __unicode__(self): + # construction helpers + @classmethod + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ - Return a string representation for this object. + Attempt to convert an array of data into an integer index. - Invoked by unicode(df) in py2 only. Yields a Unicode String in both - py2/py3. - """ - klass = self.__class__.__name__ - data = self._format_data() - attrs = self._format_attrs() - space = self._format_space() + Parameters + ---------- + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. - prepr = (u(",%s") % - space).join(u("%s=%s") % (k, v) for k, v in attrs) + Returns + ------- + int_index : data converted to either an Int64Index or a + UInt64Index - # no data provided, just attributes - if data is None: - data = '' + Raises + ------ + ValueError if the conversion was not successful. + """ - res = u("%s(%s%s)") % (klass, data, prepr) + from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desrired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - return res + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. 
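Why the uint64 fallback above exists: values past the int64 range overflow the first attempt but survive the ``'u8'`` cast. A sketch (behavior as of the 0.23/0.24-era constructor, which routes integer data through this helper):

    import pandas as pd

    pd.Index([1, 2, 3])            # Int64Index([1, 2, 3], dtype='int64')
    pd.Index([2**63, 2**63 + 1])   # UInt64Index([9223372036854775808,
                                   #              9223372036854775809], dtype='uint64')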
+ try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - def _format_space(self): + raise ValueError - # using space here controls if the attributes - # are line separated or not (the default) + @classmethod + def _scalar_data_error(cls, data): + raise TypeError('{0}(...) must be called with a collection of some ' + 'kind, {1} was passed'.format(cls.__name__, + repr(data))) - # max_seq_items = get_option('display.max_seq_items') - # if len(self) > max_seq_items: - # space = "\n%s" % (' ' * (len(klass) + 1)) - return " " + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') - @property - def _formatter_func(self): - """ - Return the formatter function. + @classmethod + def _coerce_to_ndarray(cls, data): """ - return default_pprint + Coerces data to ndarray. - def _format_data(self, name=None): - """ - Return the formatted data as a unicode string. + Converts other iterables to list first and then to array. + Does not touch ndarrays. + + Raises + ------ + TypeError + When the data passed in is a scalar. """ - # do we want to justify (only do so for non-objects) - is_justify = not (self.inferred_type in ('string', 'unicode') or - (self.inferred_type == 'categorical' and - is_object_dtype(self.categories))) + if not isinstance(data, (np.ndarray, Index)): + if data is None or is_scalar(data): + cls._scalar_data_error(data) - return format_object_summary(self, self._formatter_func, - is_justify=is_justify, name=name) + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data - def _format_attrs(self): + def _get_attributes_dict(self): """ - Return a list of tuples of the (attr,formatted_value). + Return an attributes dict for my class. """ - return format_object_attrs(self) + return {k: getattr(self, k, None) for k in self._attributes} - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values + def view(self, cls=None): - def format(self, name=False, formatter=None, **kwargs): - """ - Render a string representation of the Index. - """ - header = [] - if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result - if formatter is not None: - return header + list(self.map(formatter)) + def _coerce_scalar_to_index(self, item): + """ + We need to coerce a scalar to a compat for our index type. - return self._format_with_header(header, **kwargs) + Parameters + ---------- + item : scalar item to coerce + """ + dtype = self.dtype - def _format_with_header(self, header, na_rep='NaN', **kwargs): - values = self.values + if self._is_numeric_dtype and isna(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None - from pandas.io.formats.format import format_array + return Index([item], dtype=dtype, **self._get_attributes_dict()) - if is_categorical_dtype(values.dtype): - values = np.array(values) + _index_shared_docs['copy'] = """ + Make a copy of this object. 
Name and dtype sets those attributes on + the new object. - elif is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) + Parameters + ---------- + name : string, optional + deep : boolean, default False + dtype : numpy dtype or pandas type - if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] + Returns + ------- + copy : Index - # could have nans - mask = isna(values) - if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() + Notes + ----- + In most cases, there should be no functional difference from using + ``deep``, but if ``deep`` is passed it will attempt to deepcopy. + """ + @Appender(_index_shared_docs['copy']) + def copy(self, name=None, deep=False, dtype=None, **kwargs): + if deep: + new_index = self._shallow_copy(self._data.copy()) else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result - - def to_native_types(self, slicer=None, **kwargs): - """ - Format specified values of `self` and return them. + new_index = self._shallow_copy() - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: + names = kwargs.get('names') + names = self._validate_names(name=name, names=names, deep=deep) + new_index = new_index.set_names(names) - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ + if dtype: + new_index = new_index.astype(dtype) + return new_index - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) + def __copy__(self, **kwargs): + return self.copy(**kwargs) - def _format_native_types(self, na_rep='', quoting=None, **kwargs): + def __deepcopy__(self, memo=None): """ - Actually format specific types of the index. + Parameters + ---------- + memo, default None + Standard signature. Unused """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values - - # -------------------------------------------------------------------- - # Name-Related Methods + if memo is None: + memo = {} + return self.copy(deep=True) def _validate_names(self, name=None, names=None, deep=False): """ @@ -1311,887 +1066,407 @@ def _validate_names(self, name=None, names=None, deep=False): return [name] return name - def _get_names(self): - return FrozenList((self.name, )) + def __unicode__(self): + """ + Return a string representation for this object. - def _set_names(self, values, level=None): + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. """ - Set new names on index. Each name has to be a hashable type. + klass = self.__class__.__name__ + data = self._format_data() + attrs = self._format_attrs() + space = self._format_space() - Parameters - ---------- - values : str or sequence - name(s) to set - level : int, level name, or sequence of int/level names (default None) - If the index is a MultiIndex (hierarchical), level(s) to set (None - for all levels). 
Otherwise level must be None + prepr = (u(",%s") % + space).join(u("%s=%s") % (k, v) for k, v in attrs) - Raises - ------ - TypeError if each name is not hashable. + # no data provided, just attributes + if data is None: + data = '' + + res = u("%s(%s%s)") % (klass, data, prepr) + + return res + + def _format_space(self): + + # using space here controls if the attributes + # are line separated or not (the default) + + # max_seq_items = get_option('display.max_seq_items') + # if len(self) > max_seq_items: + # space = "\n%s" % (' ' * (len(klass) + 1)) + return " " + + @property + def _formatter_func(self): """ - if not is_list_like(values): - raise ValueError('Names must be a list-like') - if len(values) != 1: - raise ValueError('Length of new names must be 1, got %d' % - len(values)) + Return the formatter function. + """ + return default_pprint - # GH 20527 - # All items in 'name' need to be hashable: - for name in values: - if not is_hashable(name): - raise TypeError('{}.name must be a hashable type' - .format(self.__class__.__name__)) - self.name = values[0] + def _format_data(self, name=None): + """ + Return the formatted data as a unicode string. + """ - names = property(fset=_set_names, fget=_get_names) + # do we want to justify (only do so for non-objects) + is_justify = not (self.inferred_type in ('string', 'unicode') or + (self.inferred_type == 'categorical' and + is_object_dtype(self.categories))) - def set_names(self, names, level=None, inplace=False): + return format_object_summary(self, self._formatter_func, + is_justify=is_justify, name=name) + + def _format_attrs(self): """ - Set Index or MultiIndex name. + Return a list of tuples of the (attr,formatted_value). + """ + return format_object_attrs(self) - Able to set new names partially and by level. + def to_flat_index(self): + """ + Identity method. - Parameters - ---------- - names : label or list of label - Name(s) to set. - level : int, label or list of int or label, optional - If the index is a MultiIndex, level(s) to set (None for all - levels). Otherwise level must be None. - inplace : bool, default False - Modifies the object directly, instead of creating a new Index or - MultiIndex. + .. versionadded:: 0.24.0 + + This is implemented for compatability with subclass implementations + when chaining. Returns ------- - Index - The same type as the caller or None if inplace is True. + pd.Index + Caller. See Also -------- - Index.rename : Able to set new names without level. - - Examples - -------- - >>> idx = pd.Index([1, 2, 3, 4]) - >>> idx - Int64Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') - Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') + MultiIndex.to_flat_index : Subclass implementation. + """ + return self - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) - >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - labels=[[1, 1, 0, 0], [0, 1, 0, 1]]) - >>> idx.set_names(['kind', 'year'], inplace=True) - >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - labels=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=['kind', 'year']) - >>> idx.set_names('species', level=0) - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - labels=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=['species', 'year']) + def to_series(self, index=None, name=None): """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. 
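to_series, defined just above, pairs the index with itself as values, which is what makes it useful for map-based lookups (illustrative data):

    import pandas as pd

    idx = pd.Index(['a', 'b'], name='letters')
    s = idx.to_series()
    list(s.index) == list(s.values)   # True: keys and values are both the index
    s.name                            # 'letters', inherited from the index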
- from .multi import MultiIndex - if level is not None and not isinstance(self, MultiIndex): - raise ValueError('Level must be None for non-MultiIndex') + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index - if level is not None and not is_list_like(level) and is_list_like( - names): - msg = "Names must be a string when a single level is provided." - raise TypeError(msg) + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ - if not is_list_like(names) and level is None and self.nlevels > 1: - raise TypeError("Must pass list-like as `names`.") + from pandas import Series - if not is_list_like(names): - names = [names] - if level is not None and not is_list_like(level): - level = [level] + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name - if inplace: - idx = self - else: - idx = self._shallow_copy() - idx._set_names(names, level=level) - if not inplace: - return idx + return Series(self.values.copy(), index=index, name=name) - def rename(self, name, inplace=False): + def to_frame(self, index=True, name=None): """ - Alter Index or MultiIndex name. + Create a DataFrame with a column containing the Index. - Able to set new names without level. Defaults to returning new index. - Length of names must match number of levels in MultiIndex. + .. versionadded:: 0.24.0 Parameters ---------- - name : label or list of labels - Name(s) to set. - inplace : boolean, default False - Modifies the object directly, instead of creating a new Index or - MultiIndex. + index : boolean, default True + Set the index of the returned DataFrame as the original Index. + + name : object, default None + The passed name should substitute for the index name (if it has + one). Returns ------- - Index - The same type as the caller or None if inplace is True. + DataFrame + DataFrame containing the original Index data. See Also -------- - Index.set_names : Able to set new names partially and by level. + Index.to_series : Convert an Index to a Series. + Series.to_frame : Convert Series to DataFrame. Examples -------- - >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') - >>> idx.rename('grade') - Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow - >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]], - ... names=['kind', 'year']) - >>> idx - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - labels=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=['kind', 'year']) - >>> idx.rename(['species', 'year']) - MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], - labels=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=['species', 'year']) - >>> idx.rename('species') - Traceback (most recent call last): - TypeError: Must pass list-like as `names`. - """ - return self.set_names([name], inplace=inplace) + By default, the original Index is reused. To enforce a new Index: - # -------------------------------------------------------------------- - # Level-Related Methods + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow - @property - def nlevels(self): - return 1 + To override the name of the resulting column, specify `name`: - def _sort_levels_monotonic(self): - """ - Compat with MultiIndex. 
+ >>> idx.to_frame(index=False, name='zoo') + zoo + 0 Ant + 1 Bear + 2 Cow """ - return self - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level + from pandas import DataFrame + if name is None: + name = self.name or 0 + result = DataFrame({name: self.values.copy()}) + + if index: + result.index = self + return result + + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. Parameters ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. - Returns - ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level + .. versionadded:: 0.19.0 """ - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self - else: - grouper = self.map(mapper) + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self - return grouper, None, None + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) - def _validate_index_level(self, level): - """ - Validate index level. + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + def _to_safe_for_reshape(self): """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) - - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 - - def sortlevel(self, level=None, ascending=True, sort_remaining=None): + Convert to object if we are a categorical. """ - For internal compatibility with with the Index API. - - Sort the Index. 
This is for compat with MultiIndex + return self - Parameters - ---------- - ascending : boolean, default True - False to sort in descending order + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True - level, sort_remaining are compat parameters + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name - Returns - ------- - sorted_index : Index + def _convert_for_op(self, value): """ - return self.sort_values(return_indexer=True, ascending=ascending) + Convert value to be insertable to ndarray. + """ + return value - def _get_level_values(self, level): + def _assert_can_do_op(self, value): """ - Return an Index of values for requested level. + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. + @property + def nlevels(self): + return 1 + + def _get_names(self): + return FrozenList((self.name, )) + + def _set_names(self, values, level=None): + """ + Set new names on index. Each name has to be a hashable type. Parameters ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. + values : str or sequence + name(s) to set + level : int, level name, or sequence of int/level names (default None) + If the index is a MultiIndex (hierarchical), level(s) to set (None + for all levels). Otherwise level must be None - See Also - -------- - MultiIndex.get_level_values : Get values for a level of a MultiIndex. - - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') - - Get level values by supplying `level` as integer: - - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self - - get_level_values = _get_level_values - - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. - - If resulting index has only 1 level left, the result will be - of Index type, not MultiIndex. - - .. versionadded:: 0.23.1 (support for non-MultiIndex) - - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. 
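On a flat Index the level machinery in this hunk is a validated no-op: get_level_values checks the level and hands back the index itself:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'c'], name='letters')
    idx.get_level_values(0)           # the Index itself
    idx.get_level_values('letters')   # the level name is accepted too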
- - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] - - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex - - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) - - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) - - if len(new_levels) == 1: - - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - - result.name = new_names[0] - return result - else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - - # -------------------------------------------------------------------- - # Introspection Methods - - @property - def is_monotonic(self): - """ - Alias for is_monotonic_increasing. - """ - return self.is_monotonic_increasing - - @property - def is_monotonic_increasing(self): - """ - Return if the index is monotonic increasing (only equal or - increasing) values. - - Examples - -------- - >>> Index([1, 2, 3]).is_monotonic_increasing - True - >>> Index([1, 2, 2]).is_monotonic_increasing - True - >>> Index([1, 3, 2]).is_monotonic_increasing - False - """ - return self._engine.is_monotonic_increasing - - @property - def is_monotonic_decreasing(self): - """ - Return if the index is monotonic decreasing (only equal or - decreasing) values. - - Examples - -------- - >>> Index([3, 2, 1]).is_monotonic_decreasing - True - >>> Index([3, 2, 2]).is_monotonic_decreasing - True - >>> Index([3, 1, 2]).is_monotonic_decreasing - False - """ - return self._engine.is_monotonic_decreasing - - @property - def _is_strictly_monotonic_increasing(self): - """ - Return if the index is strictly monotonic increasing - (only increasing) values. - - Examples - -------- - >>> Index([1, 2, 3])._is_strictly_monotonic_increasing - True - >>> Index([1, 2, 2])._is_strictly_monotonic_increasing - False - >>> Index([1, 3, 2])._is_strictly_monotonic_increasing - False - """ - return self.is_unique and self.is_monotonic_increasing - - @property - def _is_strictly_monotonic_decreasing(self): - """ - Return if the index is strictly monotonic decreasing - (only decreasing) values. - - Examples - -------- - >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing - True - >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing - False - >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing - False - """ - return self.is_unique and self.is_monotonic_decreasing - - def is_lexsorted_for_tuple(self, tup): - return True - - @cache_readonly - def is_unique(self): - """ - Return if the index has unique values. 
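The monotonicity properties being moved here are engine-backed and non-strict; strictness additionally requires uniqueness, as the private ``_is_strictly_monotonic_*`` variants above encode:

    import pandas as pd

    pd.Index([1, 2, 2]).is_monotonic_increasing   # True (ties allowed)
    pd.Index([1, 2, 2]).is_unique                 # False, hence not strictly increasing
    pd.Index([3, 2, 1]).is_monotonic_decreasing   # True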
- """ - return self._engine.is_unique - - @property - def has_duplicates(self): - return not self.is_unique - - def is_boolean(self): - return self.inferred_type in ['boolean'] - - def is_integer(self): - return self.inferred_type in ['integer'] - - def is_floating(self): - return self.inferred_type in ['floating', 'mixed-integer-float'] - - def is_numeric(self): - return self.inferred_type in ['integer', 'floating'] - - def is_object(self): - return is_object_dtype(self.dtype) - - def is_categorical(self): - """ - Check if the Index holds categorical data. - - Returns - ------- - boolean - True if the Index is categorical. - - See Also - -------- - CategoricalIndex : Index for categorical data. - - Examples - -------- - >>> idx = pd.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.is_categorical() - True - - >>> idx = pd.Index([1, 3, 5, 7]) - >>> idx.is_categorical() - False - - >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) - >>> s - 0 Peter - 1 Victor - 2 Elisabeth - 3 Mar - dtype: object - >>> s.index.is_categorical() - False - """ - return self.inferred_type in ['categorical'] - - def is_interval(self): - return self.inferred_type in ['interval'] - - def is_mixed(self): - return self.inferred_type in ['mixed'] - - def holds_integer(self): - return self.inferred_type in ['integer', 'mixed-integer'] - - @cache_readonly - def inferred_type(self): - """ - Return a string of the type inferred from the values. - """ - return lib.infer_dtype(self) - - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) - - # -------------------------------------------------------------------- - # Conversion Methods - - def to_flat_index(self): - """ - Identity method. - - .. versionadded:: 0.24.0 - - This is implemented for compatability with subclass implementations - when chaining. - - Returns - ------- - pd.Index - Caller. - - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self - - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index - - Returns - ------- - Series : dtype will be based on the type of the Index values. - """ - - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - return Series(self.values.copy(), index=index, name=name) - - def to_frame(self, index=True, name=None): - """ - Create a DataFrame with a column containing the Index. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index. - - name : object, default None - The passed name should substitute for the index name (if it has - one). - - Returns - ------- - DataFrame - DataFrame containing the original Index data. - - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. - - Examples - -------- - >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow - - By default, the original Index is reused. 
To enforce a new Index: - - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow - - To override the name of the resulting column, specify `name`: - - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow - """ - - from pandas import DataFrame - if name is None: - name = self.name or 0 - result = DataFrame({name: self.values.copy()}) - - if index: - result.index = self - return result - - # -------------------------------------------------------------------- - # Pickle Methods - - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None - - def __setstate__(self, state): - """ - Necessary for making this object picklable. - """ - - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - - # -------------------------------------------------------------------- - # Null-Handling Methods - _na_value = np.nan - """The expected NA value to use with this index.""" - - @cache_readonly - def _isnan(self): - """ - Return if each value is NaN. + Raises + ------ + TypeError if each name is not hashable. """ - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values + if not is_list_like(values): + raise ValueError('Names must be a list-like') + if len(values) != 1: + raise ValueError('Length of new names must be 1, got %d' % + len(values)) - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) + # GH 20527 + # All items in 'name' need to be hashable: + for name in values: + if not is_hashable(name): + raise TypeError('{}.name must be a hashable type' + .format(self.__class__.__name__)) + self.name = values[0] - @cache_readonly - def hasnans(self): - """ - Return if I have any nans; enables various perf speedups. - """ - if self._can_hold_na: - return bool(self._isnan.any()) - else: - return False + names = property(fset=_set_names, fget=_get_names) - def isna(self): + def set_names(self, names, level=None, inplace=False): """ - Detect missing values. + Set Index or MultiIndex name. - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + Able to set new names partially and by level. - .. versionadded:: 0.20.0 + Parameters + ---------- + names : label or list of label + Name(s) to set. + level : int, label or list of int or label, optional + If the index is a MultiIndex, level(s) to set (None for all + levels). Otherwise level must be None. 
+ inplace : bool, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. Returns ------- - numpy.ndarray - A boolean array of whether my values are NA + Index + The same type as the caller or None if inplace is True. See Also -------- - pandas.Index.notna : Boolean inverse of isna. - pandas.Index.dropna : Omit entries with missing values. - pandas.isna : Top-level isna. - Series.isna : Detect missing values in Series object. + Index.rename : Able to set new names without level. Examples -------- - Show which entries in a pandas.Index are NA. The result is an - array. - - >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx = pd.Index([1, 2, 3, 4]) >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) - - Empty strings are not considered NA values. None is considered an NA - value. + Int64Index([1, 2, 3, 4], dtype='int64') + >>> idx.set_names('quarter') + Int64Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]]) >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) - - For datetimes, `NaT` (Not a Time) is considered as an NA value. - - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) + MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], + labels=[[1, 1, 0, 0], [0, 1, 0, 1]]) + >>> idx.set_names(['kind', 'year'], inplace=True) >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna - - def notna(self): + MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], + labels=[[1, 1, 0, 0], [0, 1, 0, 1]], + names=['kind', 'year']) + >>> idx.set_names('species', level=0) + MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], + labels=[[1, 1, 0, 0], [0, 1, 0, 1]], + names=['species', 'year']) """ - Detect existing (non-missing) values. - - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. - - .. versionadded:: 0.20.0 - - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. - - See Also - -------- - Index.notnull : Alias of notna. - Index.isna: Inverse of notna. - pandas.notna : Top-level notna. - - Examples - -------- - Show which entries in an Index are not NA. The result is an - array. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.notna() - array([ True, True, False]) + from .multi import MultiIndex + if level is not None and not isinstance(self, MultiIndex): + raise ValueError('Level must be None for non-MultiIndex') - Empty strings are not considered NA values. None is considered a NA - value. + if level is not None and not is_list_like(level) and is_list_like( + names): + msg = "Names must be a string when a single level is provided." 
+ raise TypeError(msg) - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.notna() - array([ True, True, True, False]) - """ - return ~self.isna() - notnull = notna + if not is_list_like(names) and level is None and self.nlevels > 1: + raise TypeError("Must pass list-like as `names`.") - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value + if not is_list_like(names): + names = [names] + if level is not None and not is_list_like(level): + level = [level] - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) + if inplace: + idx = self + else: + idx = self._shallow_copy() + idx._set_names(names, level=level) + if not inplace: + return idx - Returns - ------- - filled : %(klass)s + def rename(self, name, inplace=False): """ + Alter Index or MultiIndex name. - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values + Able to set new names without level. Defaults to returning new index. + Length of names must match number of levels in MultiIndex. Parameters ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. - - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() - - # -------------------------------------------------------------------- - - def _to_safe_for_reshape(self): - """ - Convert to object if we are a categorical. - """ - return self + name : label or list of labels + Name(s) to set. + inplace : boolean, default False + Modifies the object directly, instead of creating a new Index or + MultiIndex. - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True + Returns + ------- + Index + The same type as the caller or None if inplace is True. - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name - else: - result_name = get_op_result_name(self, other) - return other, result_name + See Also + -------- + Index.set_names : Able to set new names partially and by level. - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. - """ - return value + Examples + -------- + >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') + >>> idx.rename('grade') + Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') - def _assert_can_do_op(self, value): - """ - Check value is valid for scalar op. + >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], + ... [2018, 2019]], + ... 
names=['kind', 'year']) + >>> idx + MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], + labels=[[1, 1, 0, 0], [0, 1, 0, 1]], + names=['kind', 'year']) + >>> idx.rename(['species', 'year']) + MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], + labels=[[1, 1, 0, 0], [0, 1, 0, 1]], + names=['species', 'year']) + >>> idx.rename('species') + Traceback (most recent call last): + TypeError: Must pass list-like as `names`. """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) + return self.set_names([name], inplace=inplace) @property def _has_complex_internals(self): @@ -2239,6 +1514,163 @@ def summary(self, name=None): "future version.", FutureWarning, stacklevel=2) return self._summary(name) + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values + + _na_value = np.nan + """The expected NA value to use with this index.""" + + # introspection + @property + def is_monotonic(self): + """ + Alias for is_monotonic_increasing. + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """ + Return if the index is monotonic increasing (only equal or + increasing) values. + + Examples + -------- + >>> Index([1, 2, 3]).is_monotonic_increasing + True + >>> Index([1, 2, 2]).is_monotonic_increasing + True + >>> Index([1, 3, 2]).is_monotonic_increasing + False + """ + return self._engine.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + """ + Return if the index is monotonic decreasing (only equal or + decreasing) values. + + Examples + -------- + >>> Index([3, 2, 1]).is_monotonic_decreasing + True + >>> Index([3, 2, 2]).is_monotonic_decreasing + True + >>> Index([3, 1, 2]).is_monotonic_decreasing + False + """ + return self._engine.is_monotonic_decreasing + + @property + def _is_strictly_monotonic_increasing(self): + """ + Return if the index is strictly monotonic increasing + (only increasing) values. + + Examples + -------- + >>> Index([1, 2, 3])._is_strictly_monotonic_increasing + True + >>> Index([1, 2, 2])._is_strictly_monotonic_increasing + False + >>> Index([1, 3, 2])._is_strictly_monotonic_increasing + False + """ + return self.is_unique and self.is_monotonic_increasing + + @property + def _is_strictly_monotonic_decreasing(self): + """ + Return if the index is strictly monotonic decreasing + (only decreasing) values. + + Examples + -------- + >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing + True + >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing + False + >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing + False + """ + return self.is_unique and self.is_monotonic_decreasing + + def is_lexsorted_for_tuple(self, tup): + return True + + @cache_readonly + def is_unique(self): + """ + Return if the index has unique values. + """ + return self._engine.is_unique + + @property + def has_duplicates(self): + return not self.is_unique + + def is_boolean(self): + return self.inferred_type in ['boolean'] + + def is_integer(self): + return self.inferred_type in ['integer'] + + def is_floating(self): + return self.inferred_type in ['floating', 'mixed-integer-float'] + + def is_numeric(self): + return self.inferred_type in ['integer', 'floating'] + + def is_object(self): + return is_object_dtype(self.dtype) + + def is_categorical(self): + """ + Check if the Index holds categorical data. + + Returns + ------- + boolean + True if the Index is categorical. 
+ + See Also + -------- + CategoricalIndex : Index for categorical data. + + Examples + -------- + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_categorical() + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_categorical() + False + + >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() + False + """ + return self.inferred_type in ['categorical'] + + def is_interval(self): + return self.inferred_type in ['interval'] + + def is_mixed(self): + return self.inferred_type in ['mixed'] + + def holds_integer(self): + return self.inferred_type in ['integer', 'mixed-integer'] + _index_shared_docs['_convert_scalar_indexer'] = """ Convert a scalar indexer. @@ -2511,21 +1943,64 @@ def get_duplicates(self): >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP [2, 3] - Return empty array-like structure when all elements are unique. + Return empty array-like structure when all elements are unique. + + >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP + [] + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], + ... format='%Y-%m-%d') + >>> pd.Index(dates).get_duplicates() # doctest: +SKIP + DatetimeIndex([], dtype='datetime64[ns]', freq=None) + """ + warnings.warn("'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, stacklevel=2) + + return self[self.duplicated()].unique() + + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _constructor(self): + return type(self) + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + + def _validate_index_level(self, level): + """ + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. + + """ + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) + + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 - >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP - [] - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], - ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() # doctest: +SKIP - DatetimeIndex([], dtype='datetime64[ns]', freq=None) + @cache_readonly + def inferred_type(self): """ - warnings.warn("'get_duplicates' is deprecated and will be removed in " - "a future release. You can use " - "idx[idx.duplicated()].unique() instead", - FutureWarning, stacklevel=2) - - return self[self.duplicated()].unique() + Return a string of the type inferred from the values. 
+ """ + return lib.infer_dtype(self) def _is_memory_usage_qualified(self): """ @@ -2536,6 +2011,46 @@ def _is_memory_usage_qualified(self): def is_type_compatible(self, kind): return kind == self.inferred_type + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) + + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None + + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ + + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) + + elif isinstance(state, tuple): + + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] + + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) + + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") + + _unpickle_compat = __setstate__ + def __nonzero__(self): raise ValueError("The truth value of a {0} is ambiguous. " "Use a.empty, a.bool(), a.item(), a.any() or a.all()." @@ -2681,6 +2196,202 @@ def _concat_same_dtype(self, to_concat, name): # must be overridden in specific classes return _concat._concat_index_asobject(to_concat, name) + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. + + For internal compatibility with numpy arrays. + + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + + See Also + -------- + numpy.ndarray.take + """ + + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) + + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken + + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. 
+ """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values + + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) + + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False + + def isna(self): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA + + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. + + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) + + Empty strings are not considered NA values. None is considered an NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) + """ + return self._isnan + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + notnull = notna + def putmask(self, mask, value): """ Return a new Index of the values set with the mask. 
@@ -2700,6 +2411,86 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) + def format(self, name=False, formatter=None, **kwargs): + """ + Render a string representation of the Index. + """ + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header, **kwargs) + + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values + + from pandas.io.formats.format import format_array + + if is_categorical_dtype(values.dtype): + values = np.array(values) + + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) + + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] + + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() + + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result + + def to_native_types(self, slicer=None, **kwargs): + """ + Format specified values of `self` and return them. + + Parameters + ---------- + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: + + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values + """ + + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + + values[mask] = na_rep + return values + def equals(self, other): """ Determines if two Index objects contain the same elements. @@ -2892,6 +2683,25 @@ def sort(self, *args, **kwargs): raise TypeError("cannot sort an Index object in-place, use " "sort_values instead") + def sortlevel(self, level=None, ascending=True, sort_remaining=None): + """ + For internal compatibility with with the Index API. + + Sort the Index. This is for compat with MultiIndex + + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + sorted_index : Index + """ + return self.sort_values(return_indexer=True, ascending=ascending) + def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. 
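# A minimal sketch (plain public API, nothing patch-specific assumed) of the
# rendering helpers moved in the hunk above: format() produces the strings
# that repr and to_native_types() build on, with na_rep standing in for NA.
import pandas as pd

idx = pd.Index(['black', '', 'red', None], name='color')

# name=True prepends the index name as a header entry; the empty string is
# rendered as-is, while None is masked and replaced by na_rep.
print(idx.format(name=True, na_rep='<missing>'))
# ['color', 'black', '', 'red', '<missing>']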
@@ -3018,9 +2828,6 @@ def __or__(self, other): def __xor__(self, other): return self.symmetric_difference(other) - # -------------------------------------------------------------------- - # Set Operation Methods - def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, @@ -3310,8 +3117,6 @@ def symmetric_difference(self, other, result_name=None): attribs['freq'] = None return self._shallow_copy_with_infer(the_diff, **attribs) - # -------------------------------------------------------------------- - def _get_unique_index(self, dropna=False): """ Returns an index containing unique values. @@ -3470,6 +3275,104 @@ def set_value(self, arr, key, value): self._engine.set_value(com.values_from_object(arr), com.values_from_object(key), value) + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. + + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. 
The indexer should be then used as an input to ndarray.take to align the @@ -3994,9 +3897,6 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer - # -------------------------------------------------------------------- - # Join Methods - _index_shared_docs['join'] = """ Compute join_index and indexers to conform data structures to the new index. @@ -4389,8 +4289,6 @@ def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return Index(joined, name=name) - # -------------------------------------------------------------------- - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex @@ -4865,6 +4763,58 @@ def duplicated(self, keep='first'): """ return super(Index, self).duplicated(keep=keep) + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) + + Returns + ------- + filled : %(klass)s + """ + + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() + + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. 
+ + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) + + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results From fd75395a2d2a82048d34bc230e64f534488241f6 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 16:57:47 -0800 Subject: [PATCH 05/10] reimplement bits --- pandas/core/indexes/base.py | 979 +++++++++++++++++++----------------- 1 file changed, 505 insertions(+), 474 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 22c348acaf341..a622e328b7d4e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -244,6 +244,9 @@ def _outer_indexer(self, left, right): str = CachedAccessor("str", StringMethods) + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None, tupleize_cols=True, **kwargs): @@ -518,6 +521,12 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + _index_shared_docs['_shallow_copy'] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking @@ -608,12 +617,6 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def _sort_levels_monotonic(self): - """ - Compat with MultiIndex. - """ - return self - _index_shared_docs['_get_grouper_for_level'] = """ Get index grouper corresponding to an index level @@ -671,6 +674,9 @@ def _reset_identity(self): self._id = _Identity() return self + # -------------------------------------------------------------------- + # Array-Like Methods + # ndarray compat def __len__(self): """ @@ -709,6 +715,131 @@ def dtype_str(self): """ return str(self.dtype) + def ravel(self, order='C'): + """ + Return an ndarray of the flattened values of the underlying data. + + See Also + -------- + numpy.ndarray.ravel + """ + return self._ndarray_values.ravel(order=order) + + def view(self, cls=None): + + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result + + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + .. 
versionadded:: 0.19.0 + """ + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self + + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) + + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. + + For internal compatibility with numpy arrays. + + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + + See Also + -------- + numpy.ndarray.take + """ + + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) + + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken + + # -------------------------------------------------------------------- + @property def values(self): """ @@ -874,16 +1005,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) - def ravel(self, order='C'): - """ - Return an ndarray of the flattened values of the underlying data. 
- - See Also - -------- - numpy.ndarray.ravel - """ - return self._ndarray_values.ravel(order=order) - # construction helpers @classmethod def _try_convert_to_int_index(cls, data, copy, name, dtype): @@ -969,18 +1090,6 @@ def _get_attributes_dict(self): """ return {k: getattr(self, k, None) for k in self._attributes} - def view(self, cls=None): - - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -998,6 +1107,9 @@ def _coerce_scalar_to_index(self, item): return Index([item], dtype=dtype, **self._get_attributes_dict()) + # -------------------------------------------------------------------- + # Copying Methods + _index_shared_docs['copy'] = """ Make a copy of this object. Name and dtype sets those attributes on the new object. @@ -1047,24 +1159,8 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _validate_names(self, name=None, names=None, deep=False): - """ - Handles the quirks of having a singular 'name' parameter for general - Index and plural 'names' parameter for MultiIndex. - """ - from copy import deepcopy - if names is not None and name is not None: - raise TypeError("Can only provide one of `names` and `name`") - elif names is None and name is None: - return deepcopy(self.names) if deep else self.names - elif names is not None: - if not is_list_like(names): - raise TypeError("Must pass list-like as `names`.") - return names - else: - if not is_list_like(name): - return [name] - return name + # -------------------------------------------------------------------- + # Rendering Methods def __unicode__(self): """ @@ -1125,63 +1221,150 @@ def _format_attrs(self): """ return format_object_attrs(self) - def to_flat_index(self): - """ - Identity method. + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values - .. versionadded:: 0.24.0 + def format(self, name=False, formatter=None, **kwargs): + """ + Render a string representation of the Index. + """ + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') - This is implemented for compatability with subclass implementations - when chaining. + if formatter is not None: + return header + list(self.map(formatter)) - Returns - ------- - pd.Index - Caller. + return self._format_with_header(header, **kwargs) - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index. + from pandas.io.formats.format import format_array - Parameters - ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + if is_categorical_dtype(values.dtype): + values = np.array(values) - Returns - ------- - Series : dtype will be based on the type of the Index values. 
- """ + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) - from pandas import Series + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() - return Series(self.values.copy(), index=index, name=name) + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result - def to_frame(self, index=True, name=None): + def to_native_types(self, slicer=None, **kwargs): """ - Create a DataFrame with a column containing the Index. - - .. versionadded:: 0.24.0 + Format specified values of `self` and return them. Parameters ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index. + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: + + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values + """ + + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) + + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) + + values[mask] = na_rep + return values + + # -------------------------------------------------------------------- + # Conversion Methods + + def to_flat_index(self): + """ + Identity method. + + .. versionadded:: 0.24.0 + + This is implemented for compatability with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ + + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + index : boolean, default True + Set the index of the returned DataFrame as the original Index. name : object, default None The passed name should substitute for the index name (if it has @@ -1233,83 +1416,27 @@ def to_frame(self, index=True, name=None): result.index = self return result - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. 
The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - .. versionadded:: 0.19.0 - """ - - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self + # -------------------------------------------------------------------- + # Name-Centric Methods - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) - - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - - def _to_safe_for_reshape(self): + def _validate_names(self, name=None, names=None, deep=False): """ - Convert to object if we are a categorical. + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. """ - return self - - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True - - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name + from copy import deepcopy + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names else: - result_name = get_op_result_name(self, other) - return other, result_name - - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. - """ - return value - - def _assert_can_do_op(self, value): - """ - Check value is valid for scalar op. - """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - - @property - def nlevels(self): - return 1 + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name, )) @@ -1468,6 +1595,195 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + # -------------------------------------------------------------------- + # Level-Centric Methods + + @property + def nlevels(self): + return 1 + + def _sort_levels_monotonic(self): + """ + Compat with MultiIndex. + """ + return self + + def _validate_index_level(self, level): + """ + Validate index level. + + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. 
+ + """ + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) + + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 + + def sortlevel(self, level=None, ascending=True, sort_remaining=None): + """ + For internal compatibility with with the Index API. + + Sort the Index. This is for compat with MultiIndex + + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order + + level, sort_remaining are compat parameters + + Returns + ------- + sorted_index : Index + """ + return self.sort_values(return_indexer=True, ascending=ascending) + + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. + + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + # -------------------------------------------------------------------- + + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. 
+ """ + return self + + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True + + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name + + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. + """ + return value + + def _assert_can_do_op(self, value): + """ + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + @property def _has_complex_internals(self): # to disable groupby tricks in MultiIndex @@ -1514,14 +1830,12 @@ def summary(self, name=None): "future version.", FutureWarning, stacklevel=2) return self._summary(name) - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values - _na_value = np.nan """The expected NA value to use with this index.""" - # introspection + # -------------------------------------------------------------------- + # Introspection Methods + @property def is_monotonic(self): """ @@ -1671,6 +1985,21 @@ def is_mixed(self): def holds_integer(self): return self.inferred_type in ['integer', 'mixed-integer'] + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self) + + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) + + # -------------------------------------------------------------------- + _index_shared_docs['_convert_scalar_indexer'] = """ Convert a scalar indexer. @@ -1962,46 +2291,11 @@ def get_duplicates(self): def _cleanup(self): self._engine.clear_mapping() - @cache_readonly - def _constructor(self): - return type(self) - @cache_readonly def _engine(self): # property, for now, slow to look up return self._engine_type(lambda: self._ndarray_values, len(self)) - def _validate_index_level(self, level): - """ - Validate index level. - - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. - - """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) - - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 - - @cache_readonly - def inferred_type(self): - """ - Return a string of the type inferred from the values. - """ - return lib.infer_dtype(self) - def _is_memory_usage_qualified(self): """ Return a boolean if we need a qualified .info display. 
@@ -2011,12 +2305,6 @@ def _is_memory_usage_qualified(self): def is_type_compatible(self, kind): return kind == self.inferred_type - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) - def __reduce__(self): d = dict(data=self._data) d.update(self._get_attributes_dict()) @@ -2179,82 +2467,22 @@ def append(self, other): names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name - return self._concat(to_concat, name) - - def _concat(self, to_concat, name): - - typs = _concat.get_dtype_kinds(to_concat) - - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) - - def _concat_same_dtype(self, to_concat, name): - """ - Concatenate to_concat which has the same class. - """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - _index_shared_docs['take'] = """ - Return a new %(klass)s of the values selected by the indices. - - For internal compatibility with numpy arrays. + return self._concat(to_concat, name) - Parameters - ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + def _concat(self, to_concat, name): - See Also - -------- - numpy.ndarray.take - """ + typs = _concat.get_dtype_kinds(to_concat) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): + def _concat_same_dtype(self, to_concat, name): """ - Internal method to handle NA filling of take. + Concatenate to_concat which has the same class. """ - indices = ensure_platform_int(indices) - - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) @cache_readonly def _isnan(self): @@ -2411,86 +2639,6 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def format(self, name=False, formatter=None, **kwargs): - """ - Render a string representation of the Index. 
- """ - header = [] - if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') - - if formatter is not None: - return header + list(self.map(formatter)) - - return self._format_with_header(header, **kwargs) - - def _format_with_header(self, header, na_rep='NaN', **kwargs): - values = self.values - - from pandas.io.formats.format import format_array - - if is_categorical_dtype(values.dtype): - values = np.array(values) - - elif is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) - - if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] - - # could have nans - mask = isna(values) - if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() - - else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result - - def to_native_types(self, slicer=None, **kwargs): - """ - Format specified values of `self` and return them. - - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: - - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ - - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) - - def _format_native_types(self, na_rep='', quoting=None, **kwargs): - """ - Actually format specific types of the index. - """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) - else: - values = np.array(self, dtype=object, copy=True) - - values[mask] = na_rep - return values - def equals(self, other): """ Determines if two Index objects contain the same elements. @@ -2683,25 +2831,6 @@ def sort(self, *args, **kwargs): raise TypeError("cannot sort an Index object in-place, use " "sort_values instead") - def sortlevel(self, level=None, ascending=True, sort_remaining=None): - """ - For internal compatibility with with the Index API. - - Sort the Index. This is for compat with MultiIndex - - Parameters - ---------- - ascending : boolean, default True - False to sort in descending order - - level, sort_remaining are compat parameters - - Returns - ------- - sorted_index : Index - """ - return self.sort_values(return_indexer=True, ascending=ascending) - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -3275,104 +3404,6 @@ def set_value(self, arr, key, value): self._engine.set_value(com.values_from_object(arr), com.values_from_object(key), value) - def _get_level_values(self, level): - """ - Return an Index of values for requested level. - - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. - - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. - - See Also - -------- - MultiIndex.get_level_values : Get values for a level of a MultiIndex. 
- - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') - - Get level values by supplying `level` as integer: - - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self - - get_level_values = _get_level_values - - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. - - If resulting index has only 1 level left, the result will be - of Index type, not MultiIndex. - - .. versionadded:: 0.23.1 (support for non-MultiIndex) - - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. - - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] - - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex - - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) - - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) - - if len(new_levels) == 1: - - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) - - result.name = new_names[0] - return result - else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) - _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. 
The indexer should be then used as an input to ndarray.take to align the From 32d389e034163188d51fa973d8bff46cfc014cbc Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 17:07:27 -0800 Subject: [PATCH 06/10] Collect Index methods by purpose --- pandas/core/indexes/category.py | 10 + pandas/core/indexes/datetimelike.py | 11 +- pandas/core/indexes/datetimes.py | 352 +++++++++++---------- pandas/core/indexes/interval.py | 93 +++--- pandas/core/indexes/multi.py | 474 ++++++++++++++-------------- pandas/core/indexes/period.py | 23 +- pandas/core/indexes/range.py | 11 +- pandas/core/indexes/timedeltas.py | 16 +- 8 files changed, 526 insertions(+), 464 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f05b0fdd4a323..6b84e8deea493 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -94,6 +94,9 @@ def _engine_type(self): _attributes = ['name'] + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=None): @@ -212,6 +215,8 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, dtype=None, **kwargs): @@ -284,6 +289,9 @@ def equals(self, other): return False + # -------------------------------------------------------------------- + # Rendering Methods + @property def _formatter_func(self): return self.categories._formatter_func @@ -307,6 +315,8 @@ def _format_attrs(self): attrs.append(('length', len(self))) return attrs + # -------------------------------------------------------------------- + @property def inferred_type(self): return 'categorical' diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0e2f7ceb24e94..5e25efe77d8b9 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -331,9 +331,6 @@ def _box_values_as_index(self): from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): try: @@ -544,6 +541,12 @@ def argmax(self, axis=None, *args, **kwargs): i8[mask] = 0 return i8.argmax() + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + @property def _formatter_func(self): raise AbstractMethodError(self) @@ -561,6 +564,8 @@ def _format_attrs(self): attrs.append(('freq', freq)) return attrs + # -------------------------------------------------------------------- + def _convert_scalar_indexer(self, key, kind=None): """ We don't allow integer or float indexing on datetime-like when using diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8b563a9b9bed0..16c1e22d40017 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -372,22 +372,12 @@ def nbytes(self): # for TZ-aware return self._ndarray_values.nbytes - def _mpl_repr(self): - # how to represent ourselves to matplotlib 
- return libts.ints_to_pydatetime(self.asi8, self.tz) - @cache_readonly def _is_dates_only(self): """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only return _is_dates_only(self.values) and self.tz is None - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) - def __reduce__(self): # we use a special reudce here because we need @@ -439,6 +429,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs + # -------------------------------------------------------------------- + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return libts.ints_to_pydatetime(self.asi8, self.tz) + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -448,124 +445,14 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): format=format, na_rep=na_rep) - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): - # GH 18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: - return self.tz_localize(new_tz) - return self.tz_convert(new_tz) - elif is_period_dtype(dtype): - return self.to_period(freq=dtype.freq) - return super(DatetimeIndex, self).astype(dtype, copy=copy) - - def _get_time_micros(self): - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - return fields.get_time_micros(values) - - def to_series(self, keep_tz=None, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index - - Parameters - ---------- - keep_tz : optional, defaults False - Return the data keeping the timezone. - - If keep_tz is True: - - If the timezone is not set, the resulting - Series will have a datetime64[ns] dtype. - - Otherwise the Series will have an datetime64[ns, tz] dtype; the - tz will be preserved. - - If keep_tz is False: - - Series will have a datetime64[ns] dtype. TZ aware - objects will have the tz removed. - - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. - - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index - - Returns - ------- - Series - """ - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - if keep_tz is None and self.tz is not None: - warnings.warn("The default of the 'keep_tz' keyword will change " - "to True in a future release. 
You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", FutureWarning, stacklevel=2) - keep_tz = False - elif keep_tz is False: - warnings.warn("Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", FutureWarning, stacklevel=2) - - if keep_tz and self.tz is not None: - # preserve the tz & copy - values = self.copy(deep=True) - else: - values = self.values.copy() - - return Series(values, index=index, name=name) - - def snap(self, freq='S'): - """ - Snap time stamps to nearest occurring frequency - """ - # Superdumb, punting on any optimizing - freq = to_offset(freq) - - snapped = np.empty(len(self), dtype=_NS_DTYPE) - - for i, v in enumerate(self): - s = v - if not freq.onOffset(s): - t0 = freq.rollback(s) - t1 = freq.rollforward(s) - if abs(s - t0) < abs(t1 - s): - s = t0 - else: - s = t1 - snapped[i] = s - - # we know it conforms; skip check - return DatetimeIndex(snapped, freq=freq, verify_integrity=False) - # TODO: what about self.name? if so, use shallow_copy? - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: "'%s'" % formatter(x, tz=self.tz) - # TODO(DatetimeArray): change dispatch once inheritance is removed - # call DatetimeArray method - result = DatetimeArray.unique(self) - return self._shallow_copy(result._data) + # -------------------------------------------------------------------- + # Set Operation Methods def union(self, other): """ @@ -634,51 +521,6 @@ def union_many(self, others): return this - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - """ - See Index.join - """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, 'tz', None) - return self._simple_new(joined, name, tz=tz) - def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): return False @@ -805,6 +647,172 @@ def intersection(self, other): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) + # -------------------------------------------------------------------- + + @Appender(_index_shared_docs['astype']) + 
def astype(self, dtype, copy=True):
+ dtype = pandas_dtype(dtype)
+ if (is_datetime64_ns_dtype(dtype) and
+ not is_dtype_equal(dtype, self.dtype)):
+ # GH 18951: datetime64_ns dtype but not equal means different tz
+ new_tz = getattr(dtype, 'tz', None)
+ if getattr(self.dtype, 'tz', None) is None:
+ return self.tz_localize(new_tz)
+ return self.tz_convert(new_tz)
+ elif is_period_dtype(dtype):
+ return self.to_period(freq=dtype.freq)
+ return super(DatetimeIndex, self).astype(dtype, copy=copy)
+
+ def _get_time_micros(self):
+ values = self.asi8
+ if self.tz is not None and not timezones.is_utc(self.tz):
+ values = self._local_timestamps()
+ return fields.get_time_micros(values)
+
+ def to_series(self, keep_tz=None, index=None, name=None):
+ """
+ Create a Series with both index and values equal to the index keys,
+ useful with map for returning an indexer based on an index
+
+ Parameters
+ ----------
+ keep_tz : optional, defaults False
+ Return the data keeping the timezone.
+
+ If keep_tz is True:
+
+ If the timezone is not set, the resulting
+ Series will have a datetime64[ns] dtype.
+
+ Otherwise the Series will have a datetime64[ns, tz] dtype; the
+ tz will be preserved.
+
+ If keep_tz is False:
+
+ Series will have a datetime64[ns] dtype. TZ aware
+ objects will have the tz removed.
+
+ .. versionchanged:: 0.24
+ The default value will change to True in a future release.
+ You can set ``keep_tz=True`` to already obtain the future
+ behaviour and silence the warning.
+
+ index : Index, optional
+ index of resulting Series. If None, defaults to original index
+ name : string, optional
+ name of resulting Series. If None, defaults to name of original
+ index
+
+ Returns
+ -------
+ Series
+ """
+ from pandas import Series
+
+ if index is None:
+ index = self._shallow_copy()
+ if name is None:
+ name = self.name
+
+ if keep_tz is None and self.tz is not None:
+ warnings.warn("The default of the 'keep_tz' keyword will change "
+ "to True in a future release. You can set "
+ "'keep_tz=True' to obtain the future behaviour and "
+ "silence this warning.", FutureWarning, stacklevel=2)
+ keep_tz = False
+ elif keep_tz is False:
+ warnings.warn("Specifying 'keep_tz=False' is deprecated and this "
+ "option will be removed in a future release. If "
+ "you want to remove the timezone information, you "
+ "can do 'idx.tz_convert(None)' before calling "
+ "'to_series'.", FutureWarning, stacklevel=2)
+
+ if keep_tz and self.tz is not None:
+ # preserve the tz & copy
+ values = self.copy(deep=True)
+ else:
+ values = self.values.copy()
+
+ return Series(values, index=index, name=name)
+
+ def snap(self, freq='S'):
+ """
+ Snap time stamps to the nearest occurring frequency
+ """
+ # Superdumb, punting on any optimizing
+ freq = to_offset(freq)
+
+ snapped = np.empty(len(self), dtype=_NS_DTYPE)
+
+ for i, v in enumerate(self):
+ s = v
+ if not freq.onOffset(s):
+ t0 = freq.rollback(s)
+ t1 = freq.rollforward(s)
+ if abs(s - t0) < abs(t1 - s):
+ s = t0
+ else:
+ s = t1
+ snapped[i] = s
+
+ # we know it conforms; skip check
+ return DatetimeIndex(snapped, freq=freq, verify_integrity=False)
+ # TODO: what about self.name? if so, use shallow_copy?
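As a quick illustration of the `snap` behavior moved above: each element rolls back or forward to whichever boundary of the given frequency is nearer, and the result carries that freq. A doctest-style sketch with hypothetical values (not taken from the patch):

    >>> import pandas as pd
    >>> idx = pd.DatetimeIndex(['2018-01-01 01:01', '2018-01-01 23:30'])
    >>> idx.snap(freq='D')
    DatetimeIndex(['2018-01-01', '2018-01-02'], dtype='datetime64[ns]', freq='D')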
+ + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + # TODO(DatetimeArray): change dispatch once inheritance is removed + # call DatetimeArray method + result = DatetimeArray.unique(self) + return self._shallow_copy(result._data) + + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + """ + See Index.join + """ + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type not in ('floating', 'integer', 'mixed-integer', + 'mixed-integer-float', 'mixed')): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers, sort=sort) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + elif other.tz is not None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if (isinstance(other, DatetimeIndex) and + self.freq == other.freq and + self._can_fast_union(other)): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + tz = getattr(other, 'tz', None) + return self._simple_new(joined, name, tz=tz) + def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1ebcf213ab0eb..5ee6a816d91f5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -137,6 +137,9 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -168,6 +171,50 @@ def _simple_new(cls, array, name, closed=None): result._reset_identity() return result + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) + def from_arrays(cls, left, right, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, + dtype=None): + msg = ('IntervalIndex.from_intervals is deprecated and will be ' + 'removed in a future version; Use IntervalIndex(...) 
instead') + warnings.warn(msg, FutureWarning, stacklevel=2) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) + def from_tuples(cls, data, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) @@ -231,48 +278,6 @@ def contains(self, key): except KeyError: return False - @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) 
instead') - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(arr, name=name) - @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", examples=""" @@ -941,6 +946,8 @@ def __getitem__(self, value): # scalar return result + # -------------------------------------------------------------------- + # Rendering Methods # __repr__ associated methods are based on MultiIndex def _format_with_header(self, header, **kwargs): @@ -997,6 +1004,8 @@ def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) + # -------------------------------------------------------------------- + def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea6dfa6a3a6af..f03376c32f7f4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -200,6 +200,9 @@ class MultiIndex(Index): _comparables = ['names'] rename = Index.set_names + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, levels=None, labels=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True, _set_identity=True): @@ -275,6 +278,154 @@ def _verify_integrity(self, labels=None, levels=None): values=[value for value in level], level=i)) + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. 
+ """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + + from pandas.core.arrays.categorical import _factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) + if names is None: + names = [getattr(arr, "name", None) for arr in arrays] + + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names, verify_integrity=False) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + (2, u'red'), (2, u'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + + if len(tuples) == 0: + if names is None: + msg = 'Cannot infer number of levels from empty list' + raise TypeError(msg) + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = tuples._values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = lzip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of strings or None + Names for the levels in the index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = [u'green', u'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
+ """ + from pandas.core.arrays.categorical import _factorize_from_iterables + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + @property def levels(self): return self._levels @@ -622,6 +773,9 @@ def _nbytes(self, deep=False): result += self._engine.sizeof(deep=deep) return result + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) @@ -644,6 +798,94 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_native_types(self, na_rep='nan', **kwargs): + new_levels = [] + new_labels = [] + + # go through the levels and format them + for level, label in zip(self.levels, self.labels): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = (label == -1) + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + label = label.values() + label[mask] = nan_index + new_levels.append(level) + new_labels.append(label) + + if len(new_levels) == 1: + return Index(new_levels[0])._format_native_types() + else: + # reconstruct the multi-index + mi = MultiIndex(levels=new_levels, labels=new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) + return mi.values + + def format(self, space=2, sparsify=None, adjoin=True, names=False, + na_rep=None, formatter=None): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, lab in zip(self.levels, self.labels): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(lab).format(formatter=formatter) + + # we have some NA + mask = lab == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [pprint_thing(na if isna(x) else x, + escape_chars=('\t', '\r', '\n')) + for x in algos.take_1d(lev._values, lab)] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append(pprint_thing(name, + escape_chars=('\t', '\r', '\n')) + if name is not None else '') + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = '' + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthey" value + if sparsify not in [True, 1]: + sentinel = sparsify + # little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, start=int(names), + sentinel=sentinel) + + if adjoin: + from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + # -------------------------------------------------------------------- + def __len__(self): return len(self.labels[0]) @@ -705,32 +947,6 @@ def _set_names(self, names, level=None, validate=True): names = property(fset=_set_names, 
fget=_get_names, doc="Names of levels in MultiIndex") - def _format_native_types(self, na_rep='nan', **kwargs): - new_levels = [] - new_labels = [] - - # go through the levels and format them - for level, label in zip(self.levels, self.labels): - level = level._format_native_types(na_rep=na_rep, **kwargs) - # add nan values, if there are any - mask = (label == -1) - if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) - label = label.values() - label[mask] = nan_index - new_levels.append(level) - new_labels.append(label) - - if len(new_levels) == 1: - return Index(new_levels[0])._format_native_types() - else: - # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, labels=new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) - return mi.values - @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): indexer = self.labels[level] @@ -1081,66 +1297,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, lab in zip(self.levels, self.labels): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) - - if len(lev) > 0: - - formatted = lev.take(lab).format(formatter=formatter) - - # we have some NA - mask = lab == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() - - else: - # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, lab)] - stringified_levels.append(formatted) - - result_levels = [] - for lev, name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') - - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel = '' - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value - if sparsify not in [True, 1]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) - - if adjoin: - from pandas.io.formats.format import _get_adjustment - adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') - else: - return result_levels - def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) @@ -1289,152 +1445,6 @@ def lexsort_depth(self): return 0 - @classmethod - def from_arrays(cls, arrays, sortorder=None, names=None): - """ - Convert arrays to MultiIndex - - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
- MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - """ - if not is_list_like(arrays): - raise TypeError("Input must be a list / sequence of array-likes.") - elif is_iterator(arrays): - arrays = list(arrays) - - # Check if lengths of all arrays are equal or not, - # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') - - from pandas.core.arrays.categorical import _factorize_from_iterables - - labels, levels = _factorize_from_iterables(arrays) - if names is None: - names = [getattr(arr, "name", None) for arr in arrays] - - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names, verify_integrity=False) - - @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): - """ - Convert list of tuples to MultiIndex - - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables - """ - if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') - elif is_iterator(tuples): - tuples = list(tuples) - - if len(tuples) == 0: - if names is None: - msg = 'Cannot infer number of levels from empty list' - raise TypeError(msg) - arrays = [[]] * len(names) - elif isinstance(tuples, (np.ndarray, Index)): - if isinstance(tuples, Index): - tuples = tuples._values - - arrays = list(lib.tuples_to_object_array(tuples).T) - elif isinstance(tuples, list): - arrays = list(lib.to_object_array_tuples(tuples).T) - else: - arrays = lzip(*tuples) - - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) - - @classmethod - def from_product(cls, iterables, sortorder=None, names=None): - """ - Make a MultiIndex from the cartesian product of multiple iterables - - Parameters - ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of strings or None - Names for the levels in the index. - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
- """ - from pandas.core.arrays.categorical import _factorize_from_iterables - from pandas.core.reshape.util import cartesian_product - - if not is_list_like(iterables): - raise TypeError("Input must be a list / sequence of iterables.") - elif is_iterator(iterables): - iterables = list(iterables) - - labels, levels = _factorize_from_iterables(iterables) - labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index fec3a9bd24cc8..56df454bddf1c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -364,12 +364,6 @@ def to_timestamp(self, freq=None, how='start'): name=self.name, freq=result.freq) - def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, - quoting=quoting, - **kwargs) - def _maybe_convert_timedelta(self, other): """ Convert timedelta-like input to an integer multiple of self.freq @@ -412,6 +406,19 @@ def _maybe_convert_timedelta(self, other): raise IncompatibleFrequency(msg.format(cls=type(self).__name__, freqstr=self.freqstr)) + # ------------------------------------------------------------------------ + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): + # just dispatch, return ndarray + return self._data._format_native_types(na_rep=na_rep, + quoting=quoting, + **kwargs) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.astype(object).values + # ------------------------------------------------------------------------ # Indexing @@ -595,10 +602,6 @@ def is_full(self): values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.astype(object).values - @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d6286244fcb7e..364aadb9523f0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -25,7 +25,6 @@ class RangeIndex(Int64Index): - """ Immutable Index implementing a monotonic integer range. 
@@ -64,6 +63,9 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): @@ -158,6 +160,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ @@ -188,6 +192,9 @@ def __reduce__(self): d.update(dict(self._get_data_as_items())) return ibase._new_Index, (self.__class__, d), None + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr, formatted_value) @@ -201,6 +208,8 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + # -------------------------------------------------------------------- + @cache_readonly def nbytes(self): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8f50b40a20738..9ceb49a60edd2 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -127,6 +127,9 @@ def _join_i8_wrapper(joinf, **kwargs): _freq = None + # ------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, periods=None, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -193,10 +196,7 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result._reset_identity() return result - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- def __setstate__(self, state): """Necessary for making this object picklable""" @@ -218,6 +218,14 @@ def _evaluate_with_timedelta_like(self, other, op): result = TimedeltaArray._evaluate_with_timedelta_like(self, other, op) return wrap_arithmetic_op(self, other, result) + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): from pandas.io.formats.format import Timedelta64Formatter return Timedelta64Formatter(values=self, From 9319549003a0b69a605d3f5335b98d60ad4c28ea Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 18:21:11 -0800 Subject: [PATCH 07/10] dummy commit to force CI From 4b253537016d067720092df0f2781bb35fd0a7fc Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 18:44:51 -0800 Subject: [PATCH 08/10] collect more --- pandas/core/indexes/base.py | 1215 ++++++++++++++++++----------------- 1 file changed, 620 insertions(+), 595 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a622e328b7d4e..c1df1cab5884a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -527,6 +527,12 @@ def _constructor(self): # -------------------------------------------------------------------- + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. 
+ """ + return {k: getattr(self, k, None) for k in self._attributes} + _index_shared_docs['_shallow_copy'] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking @@ -617,36 +623,6 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level - - Parameters - ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level - - Returns - ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level - """ - - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self - else: - grouper = self.map(mapper) - - return grouper, None, None - def is_(self, other): """ More flexible, faster check like ``is`` but that works through views. @@ -674,6 +650,14 @@ def _reset_identity(self): self._id = _Identity() return self + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + # -------------------------------------------------------------------- # Array-Like Methods @@ -838,6 +822,45 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, taken = values.take(indices) return taken + # ops compat + def repeat(self, repeats, *args, **kwargs): + """ + Repeat elements of an Index. + + Returns a new index where each element of the current index + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int + The number of repetitions for each element. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + pandas.Index + Newly created Index with repeated elements. + + See Also + -------- + Series.repeat : Equivalent function for Series. + numpy.repeat : Underlying implementation. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + >>> idx.repeat(2) + Int64Index([1, 1, 2, 2, 3, 3], dtype='int64') + >>> idx.repeat(3) + Int64Index([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype='int64') + """ + nv.validate_repeat(args, kwargs) + return self._shallow_copy(self._values.repeat(repeats)) + # -------------------------------------------------------------------- @property @@ -930,45 +953,6 @@ def memory_usage(self, deep=False): result += self._engine.sizeof(deep=deep) return result - # ops compat - def repeat(self, repeats, *args, **kwargs): - """ - Repeat elements of an Index. - - Returns a new index where each element of the current index - is repeated consecutively a given number of times. - - Parameters - ---------- - repeats : int - The number of repetitions for each element. - **kwargs - Additional keywords have no effect but might be accepted for - compatibility with numpy. - - Returns - ------- - pandas.Index - Newly created Index with repeated elements. - - See Also - -------- - Series.repeat : Equivalent function for Series. - numpy.repeat : Underlying implementation. 
- - Examples - -------- - >>> idx = pd.Index([1, 2, 3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') - >>> idx.repeat(2) - Int64Index([1, 1, 2, 2, 3, 3], dtype='int64') - >>> idx.repeat(3) - Int64Index([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype='int64') - """ - nv.validate_repeat(args, kwargs) - return self._shallow_copy(self._values.repeat(repeats)) - _index_shared_docs['where'] = """ Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from @@ -1084,12 +1068,6 @@ def _coerce_to_ndarray(cls, data): data = np.asarray(data) return data - def _get_attributes_dict(self): - """ - Return an attributes dict for my class. - """ - return {k: getattr(self, k, None) for k in self._attributes} - def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -1749,6 +1727,36 @@ def droplevel(self, level=0): return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level + + Parameters + ---------- + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level + """ + + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) + + return grouper, None, None + # -------------------------------------------------------------------- def _to_safe_for_reshape(self): @@ -1757,19 +1765,6 @@ def _to_safe_for_reshape(self): """ return self - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True - - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name - else: - result_name = get_op_result_name(self, other) - return other, result_name - def _convert_for_op(self, value): """ Convert value to be insertable to ndarray. @@ -1830,9 +1825,6 @@ def summary(self, name=None): "future version.", FutureWarning, stacklevel=2) return self._summary(name) - _na_value = np.nan - """The expected NA value to use with this index.""" - # -------------------------------------------------------------------- # Introspection Methods @@ -1999,59 +1991,515 @@ def is_all_dates(self): return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- + # Pickle Methods - _index_shared_docs['_convert_scalar_indexer'] = """ - Convert a scalar indexer. - - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ - - @Appender(_index_shared_docs['_convert_scalar_indexer']) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + def __setstate__(self, state): + """ + Necessary for making this object picklable. 
+ """ - if len(self) and not isinstance(self, ABCMultiIndex,): + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) - # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) - # or label indexing if we are using a type able - # to be represented in the index + elif isinstance(state, tuple): - if kind in ['getitem', 'ix'] and is_float(key): - if not self.is_floating(): - return self._invalid_indexer('label', key) + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] - elif kind in ['loc'] and is_float(key): + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") - elif kind in ['loc'] and is_integer(key): - if not self.holds_integer(): - return self._invalid_indexer('label', key) + _unpickle_compat = __setstate__ - return key + # -------------------------------------------------------------------- + # Null Handling Methods - _index_shared_docs['_convert_slice_indexer'] = """ - Convert a slice indexer. + _na_value = np.nan + """The expected NA value to use with this index.""" - By definition, these are labels unless 'iloc' is passed in. - Floats are not allowed as the start, step, or stop of the slice. + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. + """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values - Parameters + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) + + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False + + def isna(self): + """ + Detect missing values. + + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA + + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. + + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) + + Empty strings are not considered NA values. None is considered an NA + value. 
+ + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. + + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) + """ + return self._isnan + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) + """ + return ~self.isna() + notnull = notna + + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value + + Parameters + ---------- + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) + + Returns + ------- + filled : %(klass)s + """ + + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() + + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. + + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) + + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + + # -------------------------------------------------------------------- + # Uniqueness Methods + + _index_shared_docs['index_unique'] = ( + """ + Return unique values in the index. Uniques are returned in order + of appearance, this does NOT sort. + + Parameters + ---------- + level : int or str, optional, default None + Only return values from specified level (for MultiIndex) + + .. 
versionadded:: 0.23.0
+
+ Returns
+ -------
+ Index without duplicates
+
+ See Also
+ --------
+ unique
+ Series.unique
+ """)
+
+ @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ def unique(self, level=None):
+ if level is not None:
+ self._validate_index_level(level)
+ result = super(Index, self).unique()
+ return self._shallow_copy(result)
+
+ def drop_duplicates(self, keep='first'):
+ """
+ Return Index with duplicate values removed.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', ``False``}, default 'first'
+ - 'first' : Drop duplicates except for the first occurrence.
+ - 'last' : Drop duplicates except for the last occurrence.
+ - ``False`` : Drop all duplicates.
+
+ Returns
+ -------
+ deduplicated : Index
+
+ See Also
+ --------
+ Series.drop_duplicates : Equivalent method on Series.
+ DataFrame.drop_duplicates : Equivalent method on DataFrame.
+ Index.duplicated : Related method on Index, indicating duplicate
+ Index values.
+
+ Examples
+ --------
+ Generate a pandas.Index with duplicate values.
+
+ >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
+
+ The `keep` parameter controls which duplicate values are removed.
+ The value 'first' keeps the first occurrence for each
+ set of duplicated entries. The default value of keep is 'first'.
+
+ >>> idx.drop_duplicates(keep='first')
+ Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
+
+ The value 'last' keeps the last occurrence for each set of duplicated
+ entries.
+
+ >>> idx.drop_duplicates(keep='last')
+ Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')
+
+ The value ``False`` discards all sets of duplicated entries.
+
+ >>> idx.drop_duplicates(keep=False)
+ Index(['cow', 'beetle', 'hippo'], dtype='object')
+ """
+ return super(Index, self).drop_duplicates(keep=keep)
+
+ def duplicated(self, keep='first'):
+ """
+ Indicate duplicate index values.
+
+ Duplicated values are indicated as ``True`` values in the resulting
+ array. Either all duplicates, all except the first, or all except the
+ last occurrence of duplicates can be indicated.
+
+ Parameters
+ ----------
+ keep : {'first', 'last', False}, default 'first'
+ The value or values in a set of duplicates to mark as missing.
+
+ - 'first' : Mark duplicates as ``True`` except for the first
+ occurrence.
+ - 'last' : Mark duplicates as ``True`` except for the last
+ occurrence.
+ - ``False`` : Mark all duplicates as ``True``.
+
+ Examples
+ --------
+ By default, for each set of duplicated values, the first occurrence is
+ set to False and all others to True:
+
+ >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama'])
+ >>> idx.duplicated()
+ array([False, False, True, False, True])
+
+ which is equivalent to
+
+ >>> idx.duplicated(keep='first')
+ array([False, False, True, False, True])
+
+ By using 'last', the last occurrence of each set of duplicated values
+ is set to False and all others to True:
+
+ >>> idx.duplicated(keep='last')
+ array([ True, False, True, False, False])
+
+ By setting keep to ``False``, all duplicates are True:
+
+ >>> idx.duplicated(keep=False)
+ array([ True, False, True, False, True])
+
+ Returns
+ -------
+ numpy.ndarray
+
+ See Also
+ --------
+ pandas.Series.duplicated : Equivalent method on pandas.Series.
+ pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame.
+ pandas.Index.drop_duplicates : Remove duplicate values from Index.
+ """
+ return super(Index, self).duplicated(keep=keep)
+
+ def get_duplicates(self):
+ """
+ Extract duplicated index elements.
+ + Returns a sorted list of index elements which appear more than once in + the index. + + .. deprecated:: 0.23.0 + Use idx[idx.duplicated()].unique() instead + + Returns + ------- + array-like + List of duplicated indexes. + + See Also + -------- + Index.duplicated : Return boolean array denoting duplicates. + Index.drop_duplicates : Return Index with duplicates removed. + + Examples + -------- + + Works on different Index of types. + + >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() # doctest: +SKIP + [2, 3] + + Note that for a DatetimeIndex, it does not return a list but a new + DatetimeIndex: + + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03', + ... '2018-01-03', '2018-01-04', '2018-01-04'], + ... format='%Y-%m-%d') + >>> pd.Index(dates).get_duplicates() # doctest: +SKIP + DatetimeIndex(['2018-01-03', '2018-01-04'], + dtype='datetime64[ns]', freq=None) + + Sorts duplicated elements even when indexes are unordered. + + >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP + [2, 3] + + Return empty array-like structure when all elements are unique. + + >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP + [] + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], + ... format='%Y-%m-%d') + >>> pd.Index(dates).get_duplicates() # doctest: +SKIP + DatetimeIndex([], dtype='datetime64[ns]', freq=None) + """ + warnings.warn("'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, stacklevel=2) + + return self[self.duplicated()].unique() + + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. + + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. + + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + + # -------------------------------------------------------------------- + + _index_shared_docs['_convert_scalar_indexer'] = """ + Convert a scalar indexer. + + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ + + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + + if kind == 'iloc': + return self._validate_indexer('positional', key, kind) + + if len(self) and not isinstance(self, ABCMultiIndex,): + + # we can raise here if we are definitive that this + # is positional indexing (eg. 
.ix on with a float) + # or label indexing if we are using a type able + # to be represented in the index + + if kind in ['getitem', 'ix'] and is_float(key): + if not self.is_floating(): + return self._invalid_indexer('label', key) + + elif kind in ['loc'] and is_float(key): + + # we want to raise KeyError on string/mixed here + # technically we *could* raise a TypeError + # on anything but mixed though + if self.inferred_type not in ['floating', + 'mixed-integer-float', + 'string', + 'unicode', + 'mixed']: + return self._invalid_indexer('label', key) + + elif kind in ['loc'] and is_integer(key): + if not self.holds_integer(): + return self._invalid_indexer('label', key) + + return key + + _index_shared_docs['_convert_slice_indexer'] = """ + Convert a slice indexer. + + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. + + Parameters ---------- key : label of the slice bound kind : {'ix', 'loc', 'getitem', 'iloc'} or None @@ -2229,73 +2677,6 @@ def _invalid_indexer(self, form, key): form=form, klass=type(self), key=key, kind=type(key))) - def get_duplicates(self): - """ - Extract duplicated index elements. - - Returns a sorted list of index elements which appear more than once in - the index. - - .. deprecated:: 0.23.0 - Use idx[idx.duplicated()].unique() instead - - Returns - ------- - array-like - List of duplicated indexes. - - See Also - -------- - Index.duplicated : Return boolean array denoting duplicates. - Index.drop_duplicates : Return Index with duplicates removed. - - Examples - -------- - - Works on different Index of types. - - >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() # doctest: +SKIP - [2, 3] - - Note that for a DatetimeIndex, it does not return a list but a new - DatetimeIndex: - - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03', - ... '2018-01-03', '2018-01-04', '2018-01-04'], - ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() # doctest: +SKIP - DatetimeIndex(['2018-01-03', '2018-01-04'], - dtype='datetime64[ns]', freq=None) - - Sorts duplicated elements even when indexes are unordered. - - >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() # doctest: +SKIP - [2, 3] - - Return empty array-like structure when all elements are unique. - - >>> pd.Index([1, 2, 3, 4]).get_duplicates() # doctest: +SKIP - [] - >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], - ... format='%Y-%m-%d') - >>> pd.Index(dates).get_duplicates() # doctest: +SKIP - DatetimeIndex([], dtype='datetime64[ns]', freq=None) - """ - warnings.warn("'get_duplicates' is deprecated and will be removed in " - "a future release. You can use " - "idx[idx.duplicated()].unique() instead", - FutureWarning, stacklevel=2) - - return self[self.duplicated()].unique() - - def _cleanup(self): - self._engine.clear_mapping() - - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) - def _is_memory_usage_qualified(self): """ Return a boolean if we need a qualified .info display. @@ -2305,47 +2686,6 @@ def _is_memory_usage_qualified(self): def is_type_compatible(self, kind): return kind == self.inferred_type - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None - - def __setstate__(self, state): - """ - Necessary for making this object picklable. 
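The ``__reduce__``/``__setstate__`` pair handled in this hunk is what makes ``Index`` survive pickling. A small round-trip sketch, assuming only the standard library ``pickle`` module and the public pandas API:

import pickle

import pandas as pd

# a round-trip through pickle preserves both the values and the name,
# which is exactly the state __reduce__ packs up
idx = pd.Index([1, 2, 3], name='key')
restored = pickle.loads(pickle.dumps(idx))
assert restored.equals(idx)
assert restored.name == 'key'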
- """ - - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - - def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) - - __bool__ = __nonzero__ - _index_shared_docs['__contains__'] = """ Return a boolean if this key is IN the index. @@ -2473,152 +2813,16 @@ def _concat(self, to_concat, name): typs = _concat.get_dtype_kinds(to_concat) - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) - - def _concat_same_dtype(self, to_concat, name): - """ - Concatenate to_concat which has the same class. - """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - @cache_readonly - def _isnan(self): - """ - Return if each value is NaN. - """ - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values - - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) - - @cache_readonly - def hasnans(self): - """ - Return if I have any nans; enables various perf speedups. - """ - if self._can_hold_na: - return bool(self._isnan.any()) - else: - return False - - def isna(self): - """ - Detect missing values. - - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - - .. versionadded:: 0.20.0 - - Returns - ------- - numpy.ndarray - A boolean array of whether my values are NA - - See Also - -------- - pandas.Index.notna : Boolean inverse of isna. - pandas.Index.dropna : Omit entries with missing values. - pandas.isna : Top-level isna. - Series.isna : Detect missing values in Series object. - - Examples - -------- - Show which entries in a pandas.Index are NA. The result is an - array. - - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) - - Empty strings are not considered NA values. None is considered an NA - value. - - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) - - For datetimes, `NaT` (Not a Time) is considered as an NA value. - - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... 
pd.Timestamp(''), None, pd.NaT]) - >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna - - def notna(self): - """ - Detect existing (non-missing) values. - - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. - - .. versionadded:: 0.20.0 - - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. - - See Also - -------- - Index.notnull : Alias of notna. - Index.isna: Inverse of notna. - pandas.notna : Top-level notna. - - Examples - -------- - Show which entries in an Index are not NA. The result is an - array. - - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.notna() - array([ True, True, False]) - - Empty strings are not considered NA values. None is considered a NA - value. + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.notna() - array([ True, True, True, False]) + def _concat_same_dtype(self, to_concat, name): """ - return ~self.isna() - notnull = notna + Concatenate to_concat which has the same class. + """ + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) def putmask(self, mask, value): """ @@ -2930,6 +3134,9 @@ def argsort(self, *args, **kwargs): result = np.array(self) return result.argsort(*args, **kwargs) + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods + def __add__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented @@ -2957,6 +3164,16 @@ def __or__(self, other): def __xor__(self, other): return self.symmetric_difference(other) + def __nonzero__(self): + raise ValueError("The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + .format(self.__class__.__name__)) + + __bool__ = __nonzero__ + + # -------------------------------------------------------------------- + # Set Operation Methods + def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, @@ -3246,35 +3463,20 @@ def symmetric_difference(self, other, result_name=None): attribs['freq'] = None return self._shallow_copy_with_infer(the_diff, **attribs) - def _get_unique_index(self, dropna=False): - """ - Returns an index containing unique values. - - Parameters - ---------- - dropna : bool - If True, NaN values are dropped. 
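Since ``notna`` is defined as the boolean inverse of ``isna``, the two masks always partition an index between missing and present values. A quick self-contained check:

import numpy as np
import pandas as pd

idx = pd.Index([5.2, 6.0, np.nan])

# notna returns exactly ~isna, so the two masks never disagree
assert (idx.notna() == ~idx.isna()).all()
assert idx.isna().sum() == 1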
- - Returns - ------- - uniques : index - """ - if self.is_unique and not dropna: - return self - - values = self.values - - if not self.is_unique: - values = self.unique() + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name - return self._shallow_copy(values) + # -------------------------------------------------------------------- _index_shared_docs['get_loc'] = """ Get integer location, slice or boolean mask for requested label. @@ -3928,6 +4130,9 @@ def _reindex_non_unique(self, target): new_index = self._shallow_copy_with_infer(new_labels, freq=None) return new_index, indexer, new_indexer + # -------------------------------------------------------------------- + # Join Methods + _index_shared_docs['join'] = """ Compute join_index and indexers to conform data structures to the new index. @@ -4320,6 +4525,8 @@ def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return Index(joined, name=name) + # -------------------------------------------------------------------- + def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex @@ -4661,190 +4868,8 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) - _index_shared_docs['index_unique'] = ( - """ - Return unique values in the index. Uniques are returned in order - of appearance, this does NOT sort. - - Parameters - ---------- - level : int or str, optional, default None - Only return values from specified level (for MultiIndex) - - .. versionadded:: 0.23.0 - - Returns - ------- - Index without duplicates - - See Also - -------- - unique - Series.unique - """) - - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - result = super(Index, self).unique() - return self._shallow_copy(result) - - def drop_duplicates(self, keep='first'): - """ - Return Index with duplicate values removed. - - Parameters - ---------- - keep : {'first', 'last', ``False``}, default 'first' - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - Returns - ------- - deduplicated : Index - - See Also - -------- - Series.drop_duplicates : Equivalent method on Series. - DataFrame.drop_duplicates : Equivalent method on DataFrame. - Index.duplicated : Related method on Index, indicating duplicate - Index values. - - Examples - -------- - Generate an pandas.Index with duplicate values. - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - - The `keep` parameter controls which duplicate values are removed. - The value 'first' keeps the first occurrence for each - set of duplicated entries. The default value of keep is 'first'. - - >>> idx.drop_duplicates(keep='first') - Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') - - The value 'last' keeps the last occurrence for each set of duplicated - entries. 
- - >>> idx.drop_duplicates(keep='last') - Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') - - The value ``False`` discards all sets of duplicated entries. - - >>> idx.drop_duplicates(keep=False) - Index(['cow', 'beetle', 'hippo'], dtype='object') - """ - return super(Index, self).drop_duplicates(keep=keep) - - def duplicated(self, keep='first'): - """ - Indicate duplicate index values. - - Duplicated values are indicated as ``True`` values in the resulting - array. Either all duplicates, all except the first, or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - The value or values in a set of duplicates to mark as missing. - - - 'first' : Mark duplicates as ``True`` except for the first - occurrence. - - 'last' : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep on ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) - - Returns - ------- - numpy.ndarray - - See Also - -------- - pandas.Series.duplicated : Equivalent method on pandas.Series. - pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. - pandas.Index.drop_duplicates : Remove duplicate values from Index. - """ - return super(Index, self).duplicated(keep=keep) - - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value - - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) - - Returns - ------- - filled : %(klass)s - """ - - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values - - Parameters - ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. 
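For the ``fillna``/``dropna`` pair documented above, a short usage sketch; the values are illustrative and only the public API is used:

import numpy as np
import pandas as pd

idx = pd.Index([1.0, np.nan, 3.0])

# fillna replaces NA values with a scalar; dropna removes them outright
assert idx.fillna(0).tolist() == [1.0, 0.0, 3.0]
assert idx.dropna().tolist() == [1.0, 3.0]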
- - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that From 8fc934f4da767559349391ff736bb5bbb2d24342 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 20:52:12 -0800 Subject: [PATCH 09/10] suggested classifications --- pandas/core/indexes/base.py | 223 +++++++++++++++++++----------------- 1 file changed, 116 insertions(+), 107 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c1df1cab5884a..5a120a8b9e0ec 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1283,6 +1283,47 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): values[mask] = na_rep return values + def _summary(self, name=None): + """ + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if (hasattr(head, 'format') and + not isinstance(head, compat.string_types)): + head = head.format() + tail = self[-1] + if (hasattr(tail, 'format') and + not isinstance(tail, compat.string_types)): + tail = tail.format() + index_summary = ', %s to %s' % (pprint_thing(head), + pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + + def summary(self, name=None): + """ + Return a summarized representation. + + .. deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) + # -------------------------------------------------------------------- # Conversion Methods @@ -1784,47 +1825,6 @@ def _has_complex_internals(self): # to disable groupby tricks in MultiIndex return False - def _summary(self, name=None): - """ - Return a summarized representation. - - Parameters - ---------- - name : str - name to use in the summary representation - - Returns - ------- - String with a summarized representation of the index - """ - if len(self) > 0: - head = self[0] - if (hasattr(head, 'format') and - not isinstance(head, compat.string_types)): - head = head.format() - tail = self[-1] - if (hasattr(tail, 'format') and - not isinstance(tail, compat.string_types)): - tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) - else: - index_summary = '' - - if name is None: - name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) - - def summary(self, name=None): - """ - Return a summarized representation. - - .. 
deprecated:: 0.23.0 - """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) - return self._summary(name) - # -------------------------------------------------------------------- # Introspection Methods @@ -2447,6 +2447,7 @@ def _get_unique_index(self, dropna=False): return self._shallow_copy(values) # -------------------------------------------------------------------- + # Indexer Conversion Methods _index_shared_docs['_convert_scalar_indexer'] = """ Convert a scalar indexer. @@ -2677,6 +2678,8 @@ def _invalid_indexer(self, form, key): form=form, klass=type(self), key=key, kind=type(key))) + # -------------------------------------------------------------------- + def _is_memory_usage_qualified(self): """ Return a boolean if we need a qualified .info display. @@ -3477,6 +3480,7 @@ def _convert_can_do_setop(self, other): return other, result_name # -------------------------------------------------------------------- + # Indexing Methods _index_shared_docs['get_loc'] = """ Get integer location, slice or boolean mask for requested label. @@ -3540,72 +3544,6 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return loc - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing. - """ - - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): - raise - elif is_integer(key): - return s[key] - - s = com.values_from_object(series) - k = com.values_from_object(key) - - k = self._convert_scalar_indexer(k, kind='getitem') - try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) - except KeyError as e1: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - - try: - return libindex.get_value_box(s, key) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover - raise IndexError(key) - raise InvalidIndexError(key) - - def set_value(self, arr, key, value): - """ - Fast lookup of value from 1-dimensional ndarray. - - Notes - ----- - Only use this if you know what you're doing. - """ - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) - _index_shared_docs['get_indexer'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -3775,6 +3713,74 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): indexer = np.where(distance <= tolerance, indexer, -1) return indexer + # -------------------------------------------------------------------- + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. 
+ """ + + # if we have something that is Index-like, then + # use this, e.g. DatetimeIndex + s = getattr(series, '_values', None) + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): + # GH 20882, 21257 + # Unify Index and ExtensionArray treatment + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): + raise + elif is_integer(key): + return s[key] + + s = com.values_from_object(series) + k = com.values_from_object(key) + + k = self._convert_scalar_indexer(k, kind='getitem') + try: + return self._engine.get_value(s, k, + tz=getattr(series.dtype, 'tz', None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + + try: + return libindex.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if is_scalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. + + Notes + ----- + Only use this if you know what you're doing. + """ + self._engine.set_value(com.values_from_object(arr), + com.values_from_object(key), value) + _index_shared_docs['get_indexer_non_unique'] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -3995,6 +4001,9 @@ def isin(self, values, level=None): self._validate_index_level(level) return algos.isin(self, values) + # -------------------------------------------------------------------- + # Reindex Methods + def _can_reindex(self, indexer): """ Check if we are allowing reindexing with this particular indexer. From a70b3199ac1cb3e3af45d6f3734d6bf4a5010d51 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 28 Nov 2018 06:51:36 -0800 Subject: [PATCH 10/10] collect uncategorized methods at the bottom --- pandas/core/indexes/base.py | 3791 +++++++++++++++++------------------ 1 file changed, 1892 insertions(+), 1899 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5a120a8b9e0ec..e8a2dd4879f20 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -526,6 +526,7 @@ def _constructor(self): return type(self) # -------------------------------------------------------------------- + # Index Internals Methods def _get_attributes_dict(self): """ @@ -822,7 +823,6 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, taken = values.take(indices) return taken - # ops compat def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Index. @@ -861,230 +861,6 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) - # -------------------------------------------------------------------- - - @property - def values(self): - """ - Return the underlying data as an ndarray. - """ - return self._data.view(np.ndarray) - - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index, np.ndarray] - # TODO(EA): remove index types as they become extension arrays - """ - The best array representation. 
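A brief sketch of the fast scalar lookup that ``get_value`` provides, as the API stands in this patch (later pandas versions deprecate it in favor of ordinary indexing):

import pandas as pd

ser = pd.Series([10, 20, 30], index=['a', 'b', 'c'])

# get_value resolves the label to a location and reads straight from
# the underlying values; equivalent to indexing via get_loc
assert ser.index.get_value(ser, 'b') == 20
assert ser.index.get_value(ser, 'b') == ser[ser.index.get_loc('b')]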
- - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. - - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. - - It may differ from the public '.values' method. - - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | - - See Also - -------- - values - _ndarray_values - """ - return self.values - - def get_values(self): - """ - Return `Index` data as an `numpy.ndarray`. - - Returns - ------- - numpy.ndarray - A one-dimensional numpy array of the `Index` values. - - See Also - -------- - Index.values : The attribute that get_values wraps. - - Examples - -------- - Getting the `Index` values of a `DataFrame`: - - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) - >>> df - A B C - a 1 2 3 - b 4 5 6 - c 7 8 9 - >>> df.index.get_values() - array(['a', 'b', 'c'], dtype=object) - - Standalone `Index` values: - - >>> idx = pd.Index(['1', '2', '3']) - >>> idx.get_values() - array(['1', '2', '3'], dtype=object) - - `MultiIndex` arrays also have only one dimension: - - >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], - ... names=('number', 'letter')) - >>> midx.get_values() - array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx.get_values().ndim - 1 - """ - return self.values - - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) - - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result - - _index_shared_docs['where'] = """ - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. - - .. versionadded:: 0.19.0 - - Parameters - ---------- - cond : boolean array-like with the same length as self - other : scalar, or array-like - """ - - @Appender(_index_shared_docs['where']) - def where(self, cond, other=None): - if other is None: - other = self._na_value - - dtype = self.dtype - values = self.values - - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None - - values = np.where(cond, values, other) - - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return self._shallow_copy_with_infer(values, dtype=dtype) - - # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. 
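The ``where`` docstring above is terse, so here is a small example of the cond/other semantics; it assumes the default ``Int64Index`` inference for a list of ints:

import pandas as pd

idx = pd.Index([1, 2, 3, 4])

# where keeps entries where cond is True and takes `other` elsewhere,
# mirroring numpy.where / Series.where semantics
assert idx.where(idx > 2, other=0).tolist() == [0, 0, 3, 4]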
- """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """ - Coerces data to ndarray. - - Converts other iterables to list first and then to array. - Does not touch ndarrays. - - Raises - ------ - TypeError - When the data passed in is a scalar. - """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - - def _coerce_scalar_to_index(self, item): - """ - We need to coerce a scalar to a compat for our index type. - - Parameters - ---------- - item : scalar item to coerce - """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) - # -------------------------------------------------------------------- # Copying Methods @@ -1798,33 +1574,6 @@ def _get_grouper_for_level(self, mapper, level=None): return grouper, None, None - # -------------------------------------------------------------------- - - def _to_safe_for_reshape(self): - """ - Convert to object if we are a categorical. - """ - return self - - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. - """ - return value - - def _assert_can_do_op(self, value): - """ - Check value is valid for scalar op. - """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - - @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False - # -------------------------------------------------------------------- # Introspection Methods @@ -2447,2094 +2196,2338 @@ def _get_unique_index(self, dropna=False): return self._shallow_copy(values) # -------------------------------------------------------------------- - # Indexer Conversion Methods - - _index_shared_docs['_convert_scalar_indexer'] = """ - Convert a scalar indexer. 
+ # Arithmetic & Logical Methods - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + return Index(np.array(self) + other) - @Appender(_index_shared_docs['_convert_scalar_indexer']) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + def __radd__(self, other): + return Index(other + np.array(self)) - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + def __iadd__(self, other): + # alias for __add__ + return self + other - if len(self) and not isinstance(self, ABCMultiIndex,): + def __sub__(self, other): + return Index(np.array(self) - other) - # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) - # or label indexing if we are using a type able - # to be represented in the index + def __rsub__(self, other): + return Index(other - np.array(self)) - if kind in ['getitem', 'ix'] and is_float(key): - if not self.is_floating(): - return self._invalid_indexer('label', key) + def __and__(self, other): + return self.intersection(other) - elif kind in ['loc'] and is_float(key): + def __or__(self, other): + return self.union(other) - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) + def __xor__(self, other): + return self.symmetric_difference(other) - elif kind in ['loc'] and is_integer(key): - if not self.holds_integer(): - return self._invalid_indexer('label', key) + def __nonzero__(self): + raise ValueError("The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + .format(self.__class__.__name__)) - return key + __bool__ = __nonzero__ - _index_shared_docs['_convert_slice_indexer'] = """ - Convert a slice indexer. + # -------------------------------------------------------------------- + # Set Operation Methods - By definition, these are labels unless 'iloc' is passed in. - Floats are not allowed as the start, step, or stop of the slice. + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) + return self + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible. 
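A compact illustration of the element-wise arithmetic dunders collected in this section:

import pandas as pd

idx = pd.Index([1, 2, 3])

# __add__/__radd__/__rsub__ operate element-wise and return new Index
# objects; Series and DataFrame operands are deferred via NotImplemented
assert (idx + 1).tolist() == [2, 3, 4]
assert (10 - idx).tolist() == [9, 8, 7]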
Parameters ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ - - @Appender(_index_shared_docs['_convert_slice_indexer']) - def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key - - # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + other : Index or array-like - # potentially cast the bounds to integers - start, stop, step = key.start, key.stop, key.step + Returns + ------- + union : Index - # figure out if this is a positional indexer - def is_int(v): - return v is None or is_integer(v) + Examples + -------- - is_null_slicer = start is None and stop is None - is_index_slice = is_int(start) and is_int(stop) - is_positional = is_index_slice and not self.is_integer() + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - if kind == 'getitem': - """ - called from the getitem slicers, validate that we are in fact - integers - """ - if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + if len(other) == 0 or self.equals(other): + return self._get_reconciled_name_object(other) - # convert the slice to an indexer here + if len(self) == 0: + return other._get_reconciled_name_object(self) - # if we are mixed and have integers - try: - if is_positional and self.is_mixed(): - # Validate start & stop - if start is not None: - self.get_loc(start) - if stop is not None: - self.get_loc(stop) - is_positional = False - except KeyError: - if self.inferred_type == 'mixed-integer-float': - raise + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.union(other) - if is_null_slicer: - indexer = key - elif is_positional: - indexer = key + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values else: + rvals = other._values + + if self.is_monotonic and other.is_monotonic: try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) - return indexer + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer, = (indexer == -1).nonzero() - def _convert_listlike_indexer(self, keyarr, kind=None): - """ - Parameters - ---------- - keyarr : list-like - Indexer to convert. 
+ if len(indexer) > 0: + other_diff = algos.take_nd(rvals, indexer, + allow_fill=False) + result = _concat._concat_compat((lvals, other_diff)) - Returns - ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys - """ - if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) - else: - keyarr = self._convert_arr_indexer(keyarr) + try: + lvals[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() - indexer = self._convert_list_indexer(keyarr, kind=kind) - return indexer, keyarr + else: + result = lvals - _index_shared_docs['_convert_arr_indexer'] = """ - Convert an array-like indexer to the appropriate dtype. + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) - Parameters - ---------- - keyarr : array-like - Indexer to convert. + # for subclasses + return self._wrap_setop_result(other, result) - Returns - ------- - converted_keyarr : array-like - """ + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) - @Appender(_index_shared_docs['_convert_arr_indexer']) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) - return keyarr + def intersection(self, other): + """ + Form the intersection of two Index objects. - _index_shared_docs['_convert_index_indexer'] = """ - Convert an Index indexer to the appropriate dtype. + This returns a new Index with elements common to the index and `other`, + preserving the order of the calling index. Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. + other : Index or array-like Returns ------- - converted_keyarr : Index (or sub-class) - """ + intersection : Index - @Appender(_index_shared_docs['_convert_index_indexer']) - def _convert_index_indexer(self, keyarr): - return keyarr - - _index_shared_docs['_convert_list_indexer'] = """ - Convert a list-like indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : Index (or sub-class) - Indexer to convert. 
- kind : iloc, ix, loc, optional + Examples + -------- - Returns - ------- - positional indexer or None - """ + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - @Appender(_index_shared_docs['_convert_list_indexer']) - def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): + if self.equals(other): + return self._get_reconciled_name_object(other) - if self.inferred_type == 'mixed-integer': - indexer = self.get_indexer(keyarr) - if (indexer >= 0).all(): - return indexer - # missing values are flagged as -1 by get_indexer and negative - # indices are already converted to positive indices in the - # above if-statement, so the negative flags are changed to - # values outside the range of indices so as to trigger an - # IndexError in maybe_convert_indices - indexer[indexer < 0] = len(self) - from pandas.core.indexing import maybe_convert_indices - return maybe_convert_indices(indexer, len(self)) + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) - elif not self.inferred_type == 'integer': - keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) - return keyarr + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - return None + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(lvals, rvals)[0] + return self._wrap_setop_result(other, result) + except TypeError: + pass - def _invalid_indexer(self, form, key): - """ - Consistent invalid indexer message. - """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + try: + indexer = Index(rvals).get_indexer(lvals) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except Exception: + # duplicates + indexer = algos.unique1d( + Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = indexer[indexer != -1] - # -------------------------------------------------------------------- + taken = other.take(indexer) + if self.name != other.name: + taken.name = None + return taken - def _is_memory_usage_qualified(self): - """ - Return a boolean if we need a qualified .info display. + def difference(self, other, sort=True): """ - return self.is_object() - - def is_type_compatible(self, kind): - return kind == self.inferred_type + Return a new Index with elements from the index that are not in + `other`. - _index_shared_docs['__contains__'] = """ - Return a boolean if this key is IN the index. + This is the set difference of two Index objects. Parameters ---------- - key : object - - Returns - ------- - boolean - """ - - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - return key in self._engine - except (OverflowError, TypeError, ValueError): - return False - - _index_shared_docs['contains'] = """ - Return a boolean if this key is IN the index. + other : Index or array-like + sort : bool, default True + Sort the resulting index if possible - Parameters - ---------- - key : object + .. 
versionadded:: 0.24.0 Returns ------- - boolean - """ - - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) - def contains(self, key): - hash(key) - try: - return key in self._engine - except (TypeError, ValueError): - return False - - def __hash__(self): - raise TypeError("unhashable type: %r" % type(self).__name__) + difference : Index - def __setitem__(self, key, value): - raise TypeError("Index does not support mutable operations") + Examples + -------- - def __getitem__(self, key): + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') """ - Override numpy.ndarray's __getitem__ method to work as desired. - - This function adds lists and Series as valid boolean indexers - (ndarrays only supports ndarray with dtype=bool). - - If resulting ndim != 1, plain ndarray is returned instead of - corresponding `Index` subclass. + self._assert_can_do_setop(other) - """ - # There's no custom logic to be implemented in __getslice__, so it's - # not overloaded intentionally. - getitem = self._data.__getitem__ - promote = self._shallow_copy + if self.equals(other): + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - if is_scalar(key): - key = com.cast_scalar_indexer(key) - return getitem(key) + other, result_name = self._convert_can_do_setop(other) - if isinstance(key, slice): - # This case is separated from the conditional above to avoid - # pessimization of basic indexing. - return promote(getitem(key)) + this = self._get_unique_index() - if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) - key = com.values_from_object(key) - result = getitem(key) - if not is_scalar(result): - return promote(result) - else: - return result + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - def _can_hold_identifiers_and_holds_name(self, name): - """ - Faster check for ``name in self`` when we know `name` is a Python - identifier (e.g. in NDFrame.__getattr__, which hits this to support - . key lookup). For indexes that can't hold identifiers (everything - but object & categorical) we just return False. + return this._shallow_copy(the_diff, name=result_name, freq=None) - https://github.com/pandas-dev/pandas/issues/19764 + def symmetric_difference(self, other, result_name=None): """ - if self.is_object() or self.is_categorical(): - return name in self - return False + Compute the symmetric difference of two Index objects. - def append(self, other): - """ - Append a collection of Index options together. + It's sorted if sorting is possible. Parameters ---------- - other : Index or list/tuple of indices + other : Index or array-like + result_name : str Returns ------- - appended : Index - """ - - to_concat = [self] + symmetric_difference : Index - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. 
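The Notes above state the identity directly; a runnable check that the symmetric difference equals the union minus the intersection, using only the set operations defined in this section:

import pandas as pd

idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([3, 4, 5, 6])

# symmetric difference == union minus intersection
expected = idx1.union(idx2).difference(idx1.intersection(idx2))
assert idx1.symmetric_difference(idx2).equals(expected)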
- for obj in to_concat: - if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + Examples + -------- + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Int64Index([1, 5], dtype='int64') - names = {obj.name for obj in to_concat} - name = None if len(names) > 1 else self.name + You can also use the ``^`` operator: - return self._concat(to_concat, name) + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update - def _concat(self, to_concat, name): + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) - typs = _concat.get_dtype_kinds(to_concat) + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) - - def _concat_same_dtype(self, to_concat, name): - """ - Concatenate to_concat which has the same class. - """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - def putmask(self, mask, value): - """ - Return a new Index of the values set with the mask. + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) - See Also - -------- - numpy.ndarray.putmask - """ - values = self.values.copy() + the_diff = _concat._concat_compat([left_diff, right_diff]) try: - np.putmask(values, mask, self._convert_for_op(value)) - return self._shallow_copy(values) - except (ValueError, TypeError) as err: - if is_object_dtype(self): - raise err + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - # coerces to object - return self.astype(object).putmask(mask, value) + attribs = self._get_attributes_dict() + attribs['name'] = result_name + if 'freq' in attribs: + attribs['freq'] = None + return self._shallow_copy_with_infer(the_diff, **attribs) - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True + def _convert_can_do_setop(self, other): if not isinstance(other, Index): - return False - - if is_object_dtype(self) and not is_object_dtype(other): - # if other is not object, use other's logic for coercion - return other.equals(self) - - try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) - except Exception: - return False - - def identical(self, other): - """ - Similar to equals, but check that other comparable attributes are - also equal. - """ - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name - def asof(self, label): - """ - Return the label from the index, or, if not present, the previous one. 
+ # -------------------------------------------------------------------- + # Indexing Methods - Assuming that the index is sorted, return the passed index label if it - is in the index, or return the previous index label if the passed one - is not in the index. + _index_shared_docs['get_loc'] = """ + Get integer location, slice or boolean mask for requested label. Parameters ---------- - label : object - The label up to which the method returns the latest index label. + key : label + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + tolerance : optional + Maximum distance from index value for inexact matches. The value of + the index at the matching location most satisfy the equation + ``abs(index[loc] - key) <= tolerance``. + + Tolerance may be a scalar + value, which applies the same tolerance to all values, or + list-like, which applies variable tolerance per element. List-like + includes list, tuple, array, Series, and must be the same size as + the index and its dtype must exactly match the index's type. + + .. versionadded:: 0.21.0 (list-like tolerance) Returns ------- - object - The passed label if it is in the index. The previous label if the - passed label is not in the sorted index or `NaN` if there is no - such label. - - See Also - -------- - Series.asof : Return the latest value in a Series up to the - passed index. - merge_asof : Perform an asof merge (similar to left join but it - matches on nearest key rather than equal key). - Index.get_loc : An `asof` is a thin wrapper around `get_loc` - with method='pad'. + loc : int if unique index, slice if monotonic index, else mask Examples - -------- - `Index.asof` returns the latest index label up to the passed label. - - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') - '2013-12-31' - - If the label is in the index, the method returns the passed label. - - >>> idx.asof('2014-01-02') - '2014-01-02' - - If all of the labels in the index are later than the passed label, - NaN is returned. - - >>> idx.asof('1999-01-02') - nan - - If the index is not sorted, an error is raised. + --------- + >>> unique_index = pd.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') - Traceback (most recent call last): - ValueError: index must be monotonic increasing or decreasing - """ - try: - loc = self.get_loc(label, method='pad') - except KeyError: - return self._na_value - else: - if isinstance(loc, slice): - loc = loc.indices(len(self))[-1] - return self[loc] + >>> monotonic_index = pd.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) - def asof_locs(self, where, mask): + >>> non_monotonic_index = pd.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True], dtype=bool) """ - Finds the locations (indices) of the labels from the index for - every entry in the `where` argument. - - As in the `asof` function, if the label (a particular entry in - `where`) is not in the index, the latest index label upto the - passed label is chosen and its index returned. 
- If all of the labels in the index are later than a label in `where`, - -1 is returned. + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if method is None: + if tolerance is not None: + raise ValueError('tolerance argument only valid if using pad, ' + 'backfill or nearest lookups') + try: + return self._engine.get_loc(key) + except KeyError: + return self._engine.get_loc(self._maybe_cast_indexer(key)) + indexer = self.get_indexer([key], method=method, tolerance=tolerance) + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError('get_loc requires scalar valued input') + loc = indexer.item() + if loc == -1: + raise KeyError(key) + return loc - `mask` is used to ignore NA values in the index during calculation. + _index_shared_docs['get_indexer'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. Parameters ---------- - where : Index - An Index consisting of an array of timestamps. - mask : array-like - Array of booleans denoting where values in the original - data are not NA. + target : %(target_klass)s + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Returns - ------- - numpy.ndarray - An array of locations (indices) of the labels from the Index - which correspond to the return values of the `asof` function - for every element in `where`. - """ - locs = self.values[mask].searchsorted(where.values, side='right') - locs = np.where(locs > 0, locs - 1, 0) + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. - result = np.arange(len(self))[mask].take(locs) + .. versionadded:: 0.21.0 (list-like tolerance) - first = mask.argmax() - result[(locs == 0) & (where.values < self.values[first])] = -1 + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. - return result + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) - def sort_values(self, return_indexer=False, ascending=True): + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. """ - Return a sorted copy of the index. - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. 
+ @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) - Parameters - ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. + # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return ensure_platform_int(np.repeat(-1, target.size)) - Returns - ------- - sorted_index : pandas.Index - Sorted copy of the index. - indexer : numpy.ndarray, optional - The indices that the index itself was sorted by. + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit, + tolerance=tolerance) - See Also - -------- - pandas.Series.sort_values : Sort values of a Series. - pandas.DataFrame.sort_values : Sort values in a DataFrame. + if not is_dtype_equal(self.dtype, target.dtype): + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit, + tolerance=tolerance) - Examples - -------- - >>> idx = pd.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + if not self.is_unique: + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') - Sort values in ascending order (default behavior). + if method == 'pad' or method == 'backfill': + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == 'nearest': + indexer = self._get_nearest_indexer(target, limit, tolerance) + else: + if tolerance is not None: + raise ValueError('tolerance argument only valid if doing pad, ' + 'backfill or nearest reindexing') + if limit is not None: + raise ValueError('limit argument only valid if doing pad, ' + 'backfill or nearest reindexing') - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + indexer = self._engine.get_indexer(target._ndarray_values) - Sort values in descending order, and also get the indices `idx` was - sorted by. 
+ return ensure_platform_int(indexer) - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + def _convert_tolerance(self, tolerance, target): + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError('list-like tolerance size must match ' + 'target index size') + return tolerance + + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = (self._engine.get_pad_indexer if method == 'pad' else + self._engine.get_backfill_indexer) + indexer = method(target._ndarray_values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, + limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, + tolerance) + return indexer + + def _get_fill_indexer_searchsorted(self, target, method, limit=None): """ - _as = self.argsort() - if not ascending: - _as = _as[::-1] + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. + """ + if limit is not None: + raise ValueError('limit argument for %r method only well-defined ' + 'if index and target are monotonic' % method) - sorted_index = self.take(_as) + side = 'left' if method == 'pad' else 'right' - if return_indexer: - return sorted_index, _as + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = (indexer == -1) + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], + side) + if side == 'left': + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values else: - return sorted_index - - def sort(self, *args, **kwargs): - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer - def shift(self, periods=1, freq=None): + def _get_nearest_indexer(self, target, limit, tolerance): """ - Shift index by desired number of time frequency increments. + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). + """ + left_indexer = self.get_indexer(target, 'pad', limit=limit) + right_indexer = self.get_indexer(target, 'backfill', limit=limit) - This method is for shifting the values of datetime-like indexes - by a specified time increment a given number of times. 
+        target = np.asarray(target)
+        left_distances = abs(self.values[left_indexer] - target)
+        right_distances = abs(self.values[right_indexer] - target)
+
+        op = operator.lt if self.is_monotonic_increasing else operator.le
+        indexer = np.where(op(left_distances, right_distances) |
+                           (right_indexer == -1), left_indexer, right_indexer)
+        if tolerance is not None:
+            indexer = self._filter_indexer_tolerance(target, indexer,
+                                                     tolerance)
+        return indexer
+
+    def _filter_indexer_tolerance(self, target, indexer, tolerance):
+        distance = abs(self.values[indexer] - target)
+        indexer = np.where(distance <= tolerance, indexer, -1)
+        return indexer
+
+    # --------------------------------------------------------------------
+    # Indexer Conversion Methods
+
+    _index_shared_docs['_convert_scalar_indexer'] = """
+        Convert a scalar indexer.

         Parameters
         ----------
-        periods : int, default 1
-            Number of periods (or increments) to shift by,
-            can be positive or negative.
-        freq : pandas.DateOffset, pandas.Timedelta or string, optional
-            Frequency increment to shift by.
-            If None, the index is shifted by its own `freq` attribute.
-            Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.
+        key : label of the slice bound
+        kind : {'ix', 'loc', 'getitem', 'iloc'} or None
+        """

-        Returns
-        -------
-        pandas.Index
-            shifted index
+    @Appender(_index_shared_docs['_convert_scalar_indexer'])
+    def _convert_scalar_indexer(self, key, kind=None):
+        assert kind in ['ix', 'loc', 'getitem', 'iloc', None]

-        See Also
-        --------
-        Series.shift : Shift values of Series.
+        if kind == 'iloc':
+            return self._validate_indexer('positional', key, kind)

-        Examples
-        --------
-        Put the first 5 month starts of 2011 into an index.
+        if len(self) and not isinstance(self, ABCMultiIndex):

-        >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS')
-        >>> month_starts
-        DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
-                       '2011-05-01'],
-                      dtype='datetime64[ns]', freq='MS')
+            # we can raise here if we are definitive that this
+            # is positional indexing (e.g. .ix with a float)
+            # or label indexing if we are using a type able
+            # to be represented in the index

-        Shift the index by 10 days.
+            if kind in ['getitem', 'ix'] and is_float(key):
+                if not self.is_floating():
+                    return self._invalid_indexer('label', key)

-        >>> month_starts.shift(10, freq='D')
-        DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
-                       '2011-05-11'],
-                      dtype='datetime64[ns]', freq=None)
+            elif kind in ['loc'] and is_float(key):

-        The default value of `freq` is the `freq` attribute of the index,
-        which is 'MS' (month start) in this example.
+                # we want to raise KeyError on string/mixed here
+                # technically we *could* raise a TypeError
+                # on anything but mixed though
+                if self.inferred_type not in ['floating',
+                                              'mixed-integer-float',
+                                              'string',
+                                              'unicode',
+                                              'mixed']:
+                    return self._invalid_indexer('label', key)

-        >>> month_starts.shift(10)
-        DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
-                       '2012-03-01'],
-                      dtype='datetime64[ns]', freq='MS')
+            elif kind in ['loc'] and is_integer(key):
+                if not self.holds_integer():
+                    return self._invalid_indexer('label', key)

-        Notes
-        -----
-        This method is only implemented for datetime-like index classes,
-        i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex.
-        """
-        raise NotImplementedError("Not supported for type %s" %
-                                  type(self).__name__)
+        return key

-    def argsort(self, *args, **kwargs):
-        """
-        Return the integer indices that would sort the index.
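# The fill/nearest machinery above reduces to two numpy idioms; a simplified
# sketch (it ignores limit handling and exact-match bookkeeping):
import numpy as np

index_vals = np.array([10, 20, 30])

# 'pad' via searchsorted: subtracting 1 finds the value to the left, and it
# also turns "insert at position 0" into -1, the shared not-found sentinel.
np.searchsorted(index_vals, [5, 25], side='left') - 1    # array([-1,  1])

# 'nearest': compare the pad and backfill candidates; strict '<' on a
# monotonic increasing index means ties prefer the larger (right) value.
left, right, key = 20, 30, 24
pick = left if abs(left - key) < abs(right - key) else right   # picks 20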
+ _index_shared_docs['_convert_slice_indexer'] = """ + Convert a slice indexer. + + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. Parameters ---------- - *args - Passed to `numpy.ndarray.argsort`. - **kwargs - Passed to `numpy.ndarray.argsort`. - - Returns - ------- - numpy.ndarray - Integer indices that would sort the index if used as - an indexer. + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - See Also - -------- - numpy.argsort : Similar method for NumPy arrays. - Index.sort_values : Return sorted copy of Index. + @Appender(_index_shared_docs['_convert_slice_indexer']) + def _convert_slice_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - Examples - -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) - >>> idx - Index(['b', 'a', 'd', 'c'], dtype='object') + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key - >>> order = idx.argsort() - >>> order - array([1, 0, 3, 2]) - - >>> idx[order] - Index(['a', 'b', 'c', 'd'], dtype='object') - """ - result = self.asi8 - if result is None: - result = np.array(self) - return result.argsort(*args, **kwargs) - - # -------------------------------------------------------------------- - # Arithmetic & Logical Methods - - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - return Index(np.array(self) + other) - - def __radd__(self, other): - return Index(other + np.array(self)) - - def __iadd__(self, other): - # alias for __add__ - return self + other - - def __sub__(self, other): - return Index(np.array(self) - other) + # validate iloc + if kind == 'iloc': + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - def __rsub__(self, other): - return Index(other - np.array(self)) + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step - def __and__(self, other): - return self.intersection(other) + # figure out if this is a positional indexer + def is_int(v): + return v is None or is_integer(v) - def __or__(self, other): - return self.union(other) + is_null_slicer = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not self.is_integer() - def __xor__(self, other): - return self.symmetric_difference(other) + if kind == 'getitem': + """ + called from the getitem slicers, validate that we are in fact + integers + """ + if self.is_integer() or is_index_slice: + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." 
- .format(self.__class__.__name__)) + # convert the slice to an indexer here - __bool__ = __nonzero__ + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type == 'mixed-integer-float': + raise - # -------------------------------------------------------------------- - # Set Operation Methods + if is_null_slicer: + indexer = key + elif is_positional: + indexer = key + else: + try: + indexer = self.slice_indexer(start, stop, step, kind=kind) + except Exception: + if is_index_slice: + if self.is_integer(): + raise + else: + indexer = key + else: + raise - def _get_reconciled_name_object(self, other): - """ - If the result of a set operation will be self, - return self, unless the name changes, in which - case make a shallow copy of self. - """ - name = get_op_result_name(self, other) - if self.name != name: - return self._shallow_copy(name=name) - return self + return indexer - def union(self, other): + def _convert_listlike_indexer(self, keyarr, kind=None): """ - Form the union of two Index objects and sorts if possible. - Parameters ---------- - other : Index or array-like + keyarr : list-like + Indexer to convert. Returns ------- - union : Index - - Examples - -------- - - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys """ - self._assert_can_do_setop(other) - other = ensure_index(other) - - if len(other) == 0 or self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return other._get_reconciled_name_object(self) - - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other) - - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): - rvals = other._ndarray_values + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) else: - rvals = other._values + keyarr = self._convert_arr_indexer(keyarr) - if self.is_monotonic and other.is_monotonic: - try: - result = self._outer_indexer(lvals, rvals)[0] - except TypeError: - # incomparable objects - result = list(lvals) + indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr - # worth making this faster? a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) - else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + _index_shared_docs['_convert_arr_indexer'] = """ + Convert an array-like indexer to the appropriate dtype. - if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) - result = _concat._concat_compat((lvals, other_diff)) + Parameters + ---------- + keyarr : array-like + Indexer to convert. 
- try: - lvals[0] < other_diff[0] - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) - else: - types = frozenset((self.inferred_type, - other.inferred_type)) - if not types & _unsortable_types: - result.sort() + Returns + ------- + converted_keyarr : array-like + """ - else: - result = lvals + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + return keyarr - try: - result = np.sort(result) - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) + _index_shared_docs['_convert_index_indexer'] = """ + Convert an Index indexer to the appropriate dtype. - # for subclasses - return self._wrap_setop_result(other, result) + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. - def _wrap_setop_result(self, other, result): - return self._constructor(result, name=get_op_result_name(self, other)) + Returns + ------- + converted_keyarr : Index (or sub-class) + """ - def intersection(self, other): - """ - Form the intersection of two Index objects. + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return keyarr - This returns a new Index with elements common to the index and `other`, - preserving the order of the calling index. + _index_shared_docs['_convert_list_indexer'] = """ + Convert a list-like indexer to the appropriate dtype. Parameters ---------- - other : Index or array-like + keyarr : Index (or sub-class) + Indexer to convert. + kind : iloc, ix, loc, optional Returns ------- - intersection : Index - - Examples - -------- - - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') - """ - self._assert_can_do_setop(other) - other = ensure_index(other) - - if self.equals(other): - return self._get_reconciled_name_object(other) + positional indexer or None + """ - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.intersection(other) + @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + if (kind in [None, 'iloc', 'ix'] and + is_integer_dtype(keyarr) and not self.is_floating() and + not isinstance(keyarr, ABCPeriodIndex)): - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values - - if self.is_monotonic and other.is_monotonic: - try: - result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_setop_result(other, result) - except TypeError: - pass + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + # missing values are flagged as -1 by get_indexer and negative + # indices are already converted to positive indices in the + # above if-statement, so the negative flags are changed to + # values outside the range of indices so as to trigger an + # IndexError in maybe_convert_indices + indexer[indexer < 0] = len(self) + from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - try: - indexer = Index(rvals).get_indexer(lvals) - indexer = indexer.take((indexer != 
-1).nonzero()[0])
-        except Exception:
-            # duplicates
-            indexer = algos.unique1d(
-                Index(rvals).get_indexer_non_unique(lvals)[0])
-            indexer = indexer[indexer != -1]
+        elif not self.inferred_type == 'integer':
+            keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr)
+            return keyarr

-        taken = other.take(indexer)
-        if self.name != other.name:
-            taken.name = None
-        return taken
+        return None

-    def difference(self, other, sort=True):
+    def _invalid_indexer(self, form, key):
         """
-        Return a new Index with elements from the index that are not in
-        `other`.
+        Consistent invalid indexer message.
+        """
+        raise TypeError("cannot do {form} indexing on {klass} with these "
+                        "indexers [{key}] of {kind}".format(
+                            form=form, klass=type(self), key=key,
+                            kind=type(key)))

-        This is the set difference of two Index objects.
+    # --------------------------------------------------------------------
+    # Reindex Methods
+
+    def _can_reindex(self, indexer):
+        """
+        Check if we are allowing reindexing with this particular indexer.

         Parameters
         ----------
-        other : Index or array-like
-        sort : bool, default True
-            Sort the resulting index if possible
+        indexer : an integer indexer

-            .. versionadded:: 0.24.0
+        Raises
+        ------
+        ValueError if it's a duplicate axis
+        """
+
+        # trying to reindex on an axis with duplicates
+        if not self.is_unique and len(indexer):
+            raise ValueError("cannot reindex from a duplicate axis")
+
+    def reindex(self, target, method=None, level=None, limit=None,
+                tolerance=None):
+        """
+        Create index with target's values (move/add/delete values
+        as necessary).
+
+        Parameters
+        ----------
+        target : an iterable

         Returns
         -------
-        difference : Index
-
-        Examples
-        --------
+        new_index : pd.Index
+            Resulting index
+        indexer : np.ndarray or None
+            Indices of output values in original index

-        >>> idx1 = pd.Index([2, 1, 3, 4])
-        >>> idx2 = pd.Index([3, 4, 5, 6])
-        >>> idx1.difference(idx2)
-        Int64Index([1, 2], dtype='int64')
-        >>> idx1.difference(idx2, sort=False)
-        Int64Index([2, 1], dtype='int64')
         """
-        self._assert_can_do_setop(other)
+        # GH6552: preserve names when reindexing to non-named target
+        # (i.e. neither Index nor Series).
+        preserve_names = not hasattr(target, 'name')

-        if self.equals(other):
-            # pass an empty np.ndarray with the appropriate dtype
-            return self._shallow_copy(self._data[:0])
+        # GH7774: preserve dtype/tz if target is empty and not an Index.
+ target = _ensure_has_len(target) # target may be an iterator - other, result_name = self._convert_can_do_setop(other) + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, dtype=self.dtype, **attrs) + else: + target = ensure_index(target) - this = self._get_unique_index() + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + if self.equals(target): + indexer = None + else: - indexer = this.get_indexer(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit, + tolerance=tolerance) + else: + if method is not None or limit is not None: + raise ValueError("cannot reindex a non-unique index " + "with a method or limit") + indexer, missing = self.get_indexer_non_unique(target) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) - the_diff = this.values.take(label_diff) - if sort: - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name - return this._shallow_copy(the_diff, name=result_name, freq=None) + return target, indexer - def symmetric_difference(self, other, result_name=None): + def _reindex_non_unique(self, target): """ - Compute the symmetric difference of two Index objects. - - It's sorted if sorting is possible. + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. Parameters ---------- - other : Index or array-like - result_name : str + target : an iterable Returns ------- - symmetric_difference : Index + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - Notes - ----- - ``symmetric_difference`` contains elements that appear in either - ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates - dropped. 
+ """ - Examples - -------- - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([2, 3, 4, 5]) - >>> idx1.symmetric_difference(idx2) - Int64Index([1, 5], dtype='int64') + target = ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None - You can also use the ``^`` operator: + if len(missing): + length = np.arange(len(indexer)) - >>> idx1 ^ idx2 - Int64Index([1, 5], dtype='int64') - """ - self._assert_can_do_setop(other) - other, result_name_update = self._convert_can_do_setop(other) - if result_name is None: - result_name = result_name_update + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = ensure_int64(length[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = ensure_int64(length[check]) - this = self._get_unique_index() - other = other._get_unique_index() - indexer = this.get_indexer(other) + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) - left_diff = this.values.take(left_indexer) + # a unique indexer + if target.is_unique: - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 - the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + # we have a non_unique selector, need to use the original + # indexer here + else: - attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None - return self._shallow_copy_with_infer(the_diff, **attribs) + # need to retake to have the same size as the indexer + indexer[~check] = -1 - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name - else: - result_name = get_op_result_name(self, other) - return other, result_name + new_index = self._shallow_copy_with_infer(new_labels, freq=None) + return new_index, indexer, new_indexer # -------------------------------------------------------------------- - # Indexing Methods + # Join Methods - _index_shared_docs['get_loc'] = """ - Get integer location, slice or boolean mask for requested label. + _index_shared_docs['join'] = """ + Compute join_index and indexers to conform data + structures to the new index. Parameters ---------- - key : label - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. 
- tolerance : optional - Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation - ``abs(index[loc] - key) <= tolerance``. + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) + + .. versionadded:: 0.20.0 + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + from .multi import MultiIndex + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + # try to figure out the join level + # GH3662 + if level is None and (self_is_mi or other_is_mi): + + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, + return_indexers=return_indexers) + + # join on the level + if level is not None and (self_is_mi or other_is_mi): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) + + other = ensure_index(other) + + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index + + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.join(other, how=how, return_indexers=return_indexers) + + _validate_join_method(how) + + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: + try: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + except TypeError: + pass + + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + elif how == 'outer': + join_index = self.union(other) + + if sort: + join_index = join_index.sort_values() + + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index + + def _join_multi(self, other, how, return_indexers=True): + from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + + # figure out join names + 
self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names + + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") + + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + if self_is_mi and other_is_mi: + + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) + + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) + + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) + + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names + + levels, labels, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) + + # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] + + # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) + + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result + + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.core.reshape.merge import _get_join_indexers + + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, + sort=True) + + left_idx = ensure_platform_int(left_idx) + right_idx = ensure_platform_int(right_idx) + + join_index = np.asarray(self._ndarray_values.take(left_idx)) + mask = left_idx == -1 + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index + + def _join_level(self, other, level, how='left', return_indexers=False, + keep_order=True): + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. + + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. + """ + from .multi import MultiIndex + + def _get_leaf_sorter(labels): + """ + Returns sorter for the inner most level while preserving the + order of higher levels. 
+ """ + if labels[0].size == 0: + return np.empty(0, dtype='int64') + + if len(labels) == 1: + lab = ensure_int64(labels[0]) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) + return sorter + + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] + + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_int64(starts)) + + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError('Join on level between two MultiIndex objects ' + 'is ambiguous') + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = left._get_level_number(level) + old_level = left.levels[level] + + if not right.is_unique: + raise NotImplementedError('Index._join_level on non-unique index ' + 'is not implemented') + + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) + + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.labels[:level + 1]) + join_index = left[left_indexer] + + else: + left_lev_indexer = ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) + + new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], + allow_fill=False) + + new_labels = list(left.labels) + new_labels[level] = new_lev_labels + + new_levels = list(left.levels) + new_levels[level] = new_level + + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_lev_labels != -1 + if not mask.all(): + new_labels = [lab[mask] for lab in new_labels] + left_indexer = left_indexer[mask] + + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_labels.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_labels, ngroups) + + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0]:] + new_labels = [lab[left_indexer] for lab in new_labels] + + else: # sort the leaves + mask = new_lev_labels != -1 + mask_all = mask.all() + if not mask_all: + new_labels = [lab[mask] for lab in new_labels] + + left_indexer = _get_leaf_sorter(new_labels[:level + 1]) + new_labels = [lab[left_indexer] for lab in new_labels] + + # left_indexers are w.r.t masked frame. + # reverse to original frame! 
+ if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] + + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names, verify_integrity=False) + + if right_lev_indexer is not None: + right_indexer = algos.take_nd(right_lev_indexer, + join_index.labels[level], + allow_fill=False) + else: + right_indexer = join_index.labels[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + if return_indexers: + left_indexer = (None if left_indexer is None + else ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else ensure_platform_int(right_indexer)) + return join_index, left_indexer, right_indexer + else: + return join_index + + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index + + sv = self._ndarray_values + ov = other._ndarray_values + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(ov, sv) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + else: + return join_index + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) + + # -------------------------------------------------------------------- + # Uncategorized Methods + + @property + def values(self): + """ + Return the underlying data as an ndarray. + """ + return self._data.view(np.ndarray) + + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. 
+ + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + + See Also + -------- + values + _ndarray_values + """ + return self.values + + def get_values(self): + """ + Return `Index` data as an `numpy.ndarray`. + + Returns + ------- + numpy.ndarray + A one-dimensional numpy array of the `Index` values. + + See Also + -------- + Index.values : The attribute that get_values wraps. + + Examples + -------- + Getting the `Index` values of a `DataFrame`: + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index.get_values() + array(['a', 'b', 'c'], dtype=object) - Tolerance may be a scalar - value, which applies the same tolerance to all values, or - list-like, which applies variable tolerance per element. List-like - includes list, tuple, array, Series, and must be the same size as - the index and its dtype must exactly match the index's type. + Standalone `Index` values: - .. versionadded:: 0.21.0 (list-like tolerance) + >>> idx = pd.Index(['1', '2', '3']) + >>> idx.get_values() + array(['1', '2', '3'], dtype=object) - Returns - ------- - loc : int if unique index, slice if monotonic index, else mask + `MultiIndex` arrays also have only one dimension: - Examples - --------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... names=('number', 'letter')) + >>> midx.get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx.get_values().ndim 1 + """ + return self.values - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) - """ + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result - @Appender(_index_shared_docs['get_loc']) - def get_loc(self, key, method=None, tolerance=None): - if method is None: - if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') - try: - return self._engine.get_loc(key) - except KeyError: - return self._engine.get_loc(self._maybe_cast_indexer(key)) - indexer = self.get_indexer([key], method=method, tolerance=tolerance) - if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') - loc = indexer.item() - if loc == -1: - raise KeyError(key) - return loc + _index_shared_docs['where'] = """ + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. - _index_shared_docs['get_indexer'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + .. 
versionadded:: 0.19.0 Parameters ---------- - target : %(target_klass)s - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - limit : int, optional - Maximum number of consecutive labels in ``target`` to match for - inexact matches. - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - - Tolerance may be a scalar value, which applies the same tolerance - to all values, or list-like, which applies variable tolerance per - element. List-like includes list, tuple, array, Series, and must be - the same size as the index and its dtype must exactly match the - index's type. - - .. versionadded:: 0.21.0 (list-like tolerance) + cond : boolean array-like with the same length as self + other : scalar, or array-like + """ - Returns - ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): + if other is None: + other = self._na_value - Examples - -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1]) + dtype = self.dtype + values = self.values - Notice that the return value is an array of locations in ``index`` - and ``x`` is marked by -1, as it is not in ``index``. - """ + if is_bool(other) or is_bool_dtype(other): - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ensure_index(target) - if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, target) + # bools force casting + values = values.astype(object) + dtype = None - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - if target.is_boolean() and self.is_numeric(): - return ensure_platform_int(np.repeat(-1, target.size)) + values = np.where(cond, values, other) - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + if self._is_numeric_dtype and np.any(isna(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None - if not is_dtype_equal(self.dtype, target.dtype): - this = self.astype(object) - target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + return self._shallow_copy_with_infer(values, dtype=dtype) - if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + # construction helpers + @classmethod + def _try_convert_to_int_index(cls, data, copy, name, dtype): + """ + Attempt to convert an array of data into an integer index. 
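# Index.where as implemented above, sketched on a numeric index (values are
# illustrative; note how the dtype comment in the code shows up in the result):
import pandas as pd

sketch = pd.Index([1, 2, 3])
sketch.where(sketch > 1)
# -> Float64Index([nan, 2.0, 3.0], dtype='float64'); the int64 dtype is
#    dropped because NaN appears in the output, per the comment above.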
-        if method == 'pad' or method == 'backfill':
-            indexer = self._get_fill_indexer(target, method, limit, tolerance)
-        elif method == 'nearest':
-            indexer = self._get_nearest_indexer(target, limit, tolerance)
-        else:
-            if tolerance is not None:
-                raise ValueError('tolerance argument only valid if doing pad, '
-                                 'backfill or nearest reindexing')
-            if limit is not None:
-                raise ValueError('limit argument only valid if doing pad, '
-                                 'backfill or nearest reindexing')
+        Parameters
+        ----------
+        data : The data to convert.
+        copy : Whether to copy the data or not.
+        name : The name of the index returned.

-            indexer = self._engine.get_indexer(target._ndarray_values)
+        Returns
+        -------
+        int_index : data converted to either an Int64Index or a
+                    UInt64Index

-        return ensure_platform_int(indexer)
+        Raises
+        ------
+        ValueError if the conversion was not successful.
+        """

-    def _convert_tolerance(self, tolerance, target):
-        # override this method on subclasses
-        tolerance = np.asarray(tolerance)
-        if target.size != tolerance.size and tolerance.size > 1:
-            raise ValueError('list-like tolerance size must match '
-                             'target index size')
-        return tolerance
+        from .numeric import Int64Index, UInt64Index
+        if not is_unsigned_integer_dtype(dtype):
+            # skip int64 conversion attempt if uint-like dtype is passed, as
+            # this could return Int64Index when UInt64Index is what's desired
+            try:
+                res = data.astype('i8', copy=False)
+                if (res == data).all():
+                    return Int64Index(res, copy=copy, name=name)
+            except (OverflowError, TypeError, ValueError):
+                pass

-    def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
-        if self.is_monotonic_increasing and target.is_monotonic_increasing:
-            method = (self._engine.get_pad_indexer if method == 'pad' else
-                      self._engine.get_backfill_indexer)
-            indexer = method(target._ndarray_values, limit)
-        else:
-            indexer = self._get_fill_indexer_searchsorted(target, method,
-                                                          limit)
-        if tolerance is not None:
-            indexer = self._filter_indexer_tolerance(target._ndarray_values,
-                                                     indexer,
-                                                     tolerance)
-        return indexer
+        # Conversion to int64 failed (possibly due to overflow) or was skipped,
+        # so let's try now with uint64.
+        try:
+            res = data.astype('u8', copy=False)
+            if (res == data).all():
+                return UInt64Index(res, copy=copy, name=name)
+        except (OverflowError, TypeError, ValueError):
+            pass

-    def _get_fill_indexer_searchsorted(self, target, method, limit=None):
-        """
-        Fallback pad/backfill get_indexer that works for monotonic decreasing
-        indexes and non-monotonic targets.
-        """
-        if limit is not None:
-            raise ValueError('limit argument for %r method only well-defined '
-                             'if index and target are monotonic' % method)
+        raise ValueError

-        side = 'left' if method == 'pad' else 'right'
+    @classmethod
+    def _scalar_data_error(cls, data):
+        raise TypeError('{0}(...) must be called with a collection of some '
+                        'kind, {1} was passed'.format(cls.__name__,
+                                                      repr(data)))

-        # find exact matches first (this simplifies the algorithm)
-        indexer = self.get_indexer(target)
-        nonexact = (indexer == -1)
-        indexer[nonexact] = self._searchsorted_monotonic(target[nonexact],
-                                                         side)
-        if side == 'left':
-            # searchsorted returns "indices into a sorted array such that,
-            # if the corresponding elements in v were inserted before the
-            # indices, the order of a would be preserved".
-            # Thus, we need to subtract 1 to find values to the left.
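# Why the uint64 attempt above runs second, and why unsigned dtypes skip the
# int64 attempt entirely -- a small numpy sketch of the overflow check:
import numpy as np

data = np.array([2 ** 63], dtype=np.uint64)
res = data.astype('i8', copy=False)   # overflows and wraps negative
bool((res == data).all())             # False -> Int64Index attempt rejected
res = data.astype('u8', copy=False)
bool((res == data).all())             # True -> UInt64Index is returned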
- indexer[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - indexer[indexer == len(self)] = -1 - return indexer + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') - def _get_nearest_indexer(self, target, limit, tolerance): - """ - Get the indexer for the nearest index labels; requires an index with - values that can be subtracted from each other (e.g., not strings or - tuples). + @classmethod + def _coerce_to_ndarray(cls, data): """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) + Coerces data to ndarray. - target = np.asarray(target) - left_distances = abs(self.values[left_indexer] - target) - right_distances = abs(self.values[right_indexer] - target) + Converts other iterables to list first and then to array. + Does not touch ndarrays. - op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) - return indexer + Raises + ------ + TypeError + When the data passed in is a scalar. + """ - def _filter_indexer_tolerance(self, target, indexer, tolerance): - distance = abs(self.values[indexer] - target) - indexer = np.where(distance <= tolerance, indexer, -1) - return indexer + if not isinstance(data, (np.ndarray, Index)): + if data is None or is_scalar(data): + cls._scalar_data_error(data) - # -------------------------------------------------------------------- + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data - def get_value(self, series, key): + def _coerce_scalar_to_index(self, item): """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing. + We need to coerce a scalar to a compat for our index type. + + Parameters + ---------- + item : scalar item to coerce """ + dtype = self.dtype - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): - raise - elif is_integer(key): - return s[key] + if self._is_numeric_dtype and isna(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. 
+ dtype = None - s = com.values_from_object(series) - k = com.values_from_object(key) + return Index([item], dtype=dtype, **self._get_attributes_dict()) - k = self._convert_scalar_indexer(k, kind='getitem') - try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) - except KeyError as e1: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self - try: - return libindex.get_value_box(s, key) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover - raise IndexError(key) - raise InvalidIndexError(key) + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. + """ + return value - def set_value(self, arr, key, value): + def _assert_can_do_op(self, value): """ - Fast lookup of value from 1-dimensional ndarray. + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) - Notes - ----- - Only use this if you know what you're doing. + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False + + def _is_memory_usage_qualified(self): """ - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + Return a boolean if we need a qualified .info display. + """ + return self.is_object() - _index_shared_docs['get_indexer_non_unique'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + def is_type_compatible(self, kind): + return kind == self.inferred_type + + _index_shared_docs['__contains__'] = """ + Return a boolean if this key is IN the index. Parameters ---------- - target : %(target_klass)s + key : object Returns ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. - missing : ndarray of int - An indexer into the target of the values not found. - These correspond to the -1 in the indexer array - """ - - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - if is_categorical(target): - target = target.astype(target.dtype.categories.dtype) - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer_non_unique(ptarget) - - if self.is_all_dates: - self = Index(self.asi8) - tgt_values = target.asi8 - else: - tgt_values = target._ndarray_values - - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return ensure_platform_int(indexer), missing - - def get_indexer_for(self, target, **kwargs): - """ - Guaranteed return of an indexer even when non-unique. - - This dispatches to get_indexer or get_indexer_nonunique - as appropriate. 
+ boolean """ - if self.is_unique: - return self.get_indexer(target, **kwargs) - indexer, _ = self.get_indexer_non_unique(target, **kwargs) - return indexer - def _maybe_promote(self, other): - # A hack, but it works - from pandas import DatetimeIndex - if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): - return DatetimeIndex(self), other - elif self.inferred_type == 'boolean': - if not is_object_dtype(self.dtype): - return self.astype('object'), other.astype('object') - return self, other + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False - def groupby(self, values): - """ - Group the index labels by a given array of values. + _index_shared_docs['contains'] = """ + Return a boolean if this key is IN the index. Parameters ---------- - values : array - Values used to determine the groups. + key : object Returns ------- - groups : dict - {group name -> group labels} + boolean """ - # TODO: if we are a MultiIndex, we can do better - # that converting to tuples - from .multi import MultiIndex - if isinstance(values, MultiIndex): - values = values.values - values = ensure_categorical(values) - result = values._reverse_indexer() + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): + hash(key) + try: + return key in self._engine + except (TypeError, ValueError): + return False - # map to the label - result = {k: self.take(v) for k, v in compat.iteritems(result)} + def __hash__(self): + raise TypeError("unhashable type: %r" % type(self).__name__) - return result + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") - def map(self, mapper, na_action=None): + def __getitem__(self, key): """ - Map values using input correspondence (a dict, Series, or function). + Override numpy.ndarray's __getitem__ method to work as desired. - Parameters - ---------- - mapper : function, dict, or Series - Mapping correspondence. - na_action : {None, 'ignore'} - If 'ignore', propagate NA values, without passing them to the - mapping correspondence. + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. - Returns - ------- - applied : Union[Index, MultiIndex], inferred - The output of the mapping function applied to the index. - If the function returns a tuple with more than one element - a MultiIndex will be returned. """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + getitem = self._data.__getitem__ + promote = self._shallow_copy - from .multi import MultiIndex - new_values = super(Index, self)._map_values( - mapper, na_action=na_action) + if is_scalar(key): + key = com.cast_scalar_indexer(key) + return getitem(key) - attributes = self._get_attributes_dict() + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. 
+ return promote(getitem(key))
- # we can return a MultiIndex
- if new_values.size and isinstance(new_values[0], tuple):
- if isinstance(self, MultiIndex):
- names = self.names
- elif attributes.get('name'):
- names = [attributes.get('name')] * len(new_values[0])
- else:
- names = None
- return MultiIndex.from_tuples(new_values,
- names=names)
+ if com.is_bool_indexer(key):
+ key = np.asarray(key, dtype=bool)
- attributes['copy'] = False
- if not new_values.size:
- # empty
- attributes['dtype'] = self.dtype
+ key = com.values_from_object(key)
+ result = getitem(key)
+ if not is_scalar(result):
+ return promote(result)
+ else:
+ return result
- return Index(new_values, **attributes)
+ def _can_hold_identifiers_and_holds_name(self, name):
+ """
+ Faster check for ``name in self`` when we know `name` is a Python
+ identifier (e.g. in NDFrame.__getattr__, which hits this to support
+ . key lookup). For indexes that can't hold identifiers (everything
+ but object & categorical) we just return False.
- def isin(self, values, level=None):
+ https://github.com/pandas-dev/pandas/issues/19764
"""
- Return a boolean array where the index values are in `values`.
+ if self.is_object() or self.is_categorical():
+ return name in self
+ return False
- Compute boolean array of whether each index value is found in the
- passed set of values. The length of the returned boolean array matches
- the length of the index.
+ def append(self, other):
+ """
+ Append a collection of Index objects together.
Parameters
----------
- values : set or list-like
- Sought values.
-
- .. versionadded:: 0.18.1
-
- Support for values as a set.
-
- level : str or int, optional
- Name or position of the index level to use (if the index is a
- `MultiIndex`).
+ other : Index or list/tuple of indices
Returns
-------
- is_contained : ndarray
- NumPy array of boolean values.
-
- See Also
- --------
- Series.isin : Same for Series.
- DataFrame.isin : Same method for DataFrames.
+ appended : Index
+ """
- Notes
- -----
- In the case of `MultiIndex` you must either specify `values` as a
- list-like object containing tuples that are the same length as the
- number of levels, or specify `level`. Otherwise it will raise a
- ``ValueError``.
+ to_concat = [self]
- If `level` is specified:
+ if isinstance(other, (list, tuple)):
+ to_concat = to_concat + list(other)
+ else:
+ to_concat.append(other)
- - if it is the name of one *and only one* index level, use that level;
- - otherwise it should be a number indicating level position.
+ for obj in to_concat:
+ if not isinstance(obj, Index):
+ raise TypeError('all inputs must be Index')
- Examples
- --------
- >>> idx = pd.Index([1,2,3])
- >>> idx
- Int64Index([1, 2, 3], dtype='int64')
+ names = {obj.name for obj in to_concat}
+ name = None if len(names) > 1 else self.name
- Check whether each index value in a list of values.
- >>> idx.isin([1, 4])
- array([ True, False, False])
+ return self._concat(to_concat, name)
- >>> midx = pd.MultiIndex.from_arrays([[1,2,3],
- ... ['red', 'blue', 'green']],
- ... names=('number', 'color'))
- >>> midx
- MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']],
- labels=[[0, 1, 2], [2, 0, 1]],
- names=['number', 'color'])
+ def _concat(self, to_concat, name):
- Check whether the strings in the 'color' level of the MultiIndex
- are in a list of colors.
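+ # If all inputs share a single dtype kind, delegate to the
+ # subclass-specific _concat_same_dtype; otherwise concatenate
+ # as object dtype.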
+ typs = _concat.get_dtype_kinds(to_concat) - >>> midx.isin(['red', 'orange', 'yellow'], level='color') - array([ True, False, False]) + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - To check across the levels of a MultiIndex, pass a list of tuples: + def _concat_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class. + """ + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) - >>> midx.isin([(1, 'red'), (3, 'red')]) - array([ True, False, False]) + def putmask(self, mask, value): + """ + Return a new Index of the values set with the mask. - For a DatetimeIndex, string values in `values` are converted to - Timestamps. + See Also + -------- + numpy.ndarray.putmask + """ + values = self.values.copy() + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) + # coerces to object + return self.astype(object).putmask(mask, value) - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) + def equals(self, other): """ - if level is not None: - self._validate_index_level(level) - return algos.isin(self, values) + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True - # -------------------------------------------------------------------- - # Reindex Methods + if not isinstance(other, Index): + return False - def _can_reindex(self, indexer): - """ - Check if we are allowing reindexing with this particular indexer. + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) - Parameters - ---------- - indexer : an integer indexer + try: + return array_equivalent(com.values_from_object(self), + com.values_from_object(other)) + except Exception: + return False - Raises - ------ - ValueError if its a duplicate axis + def identical(self, other): """ + Similar to equals, but check that other comparable attributes are + also equal. + """ + return (self.equals(other) and + all((getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables)) and + type(self) == type(other)) - # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): - raise ValueError("cannot reindex from a duplicate axis") - - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def asof(self, label): """ - Create index with target's values (move/add/delete values - as necessary). + Return the label from the index, or, if not present, the previous one. + + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. Parameters ---------- - target : an iterable + label : object + The label up to which the method returns the latest index label. Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. 
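+
+ Raises
+ ------
+ ValueError
+ If the index is not monotonic increasing or decreasing (see the
+ last example below).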
- """ - # GH6552: preserve names when reindexing to non-named target - # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') + See Also + -------- + Series.asof : Return the latest value in a Series up to the + passed index. + merge_asof : Perform an asof merge (similar to left join but it + matches on nearest key rather than equal key). + Index.get_loc : An `asof` is a thin wrapper around `get_loc` + with method='pad'. - # GH7774: preserve dtype/tz if target is empty and not an Index. - target = _ensure_has_len(target) # target may be an iterator + Examples + -------- + `Index.asof` returns the latest index label up to the passed label. - if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, dtype=self.dtype, **attrs) - else: - target = ensure_index(target) + >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) + >>> idx.asof('2014-01-01') + '2013-12-31' - if level is not None: - if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) + If the label is in the index, the method returns the passed label. + + >>> idx.asof('2014-01-02') + '2014-01-02' + + If all of the labels in the index are later than the passed label, + NaN is returned. + + >>> idx.asof('1999-01-02') + nan + + If the index is not sorted, an error is raised. + + >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', + ... '2014-01-03']) + >>> idx_not_sorted.asof('2013-12-31') + Traceback (most recent call last): + ValueError: index must be monotonic increasing or decreasing + """ + try: + loc = self.get_loc(label, method='pad') + except KeyError: + return self._na_value else: - if self.equals(target): - indexer = None - else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + return self[loc] - if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) - else: - if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") - indexer, missing = self.get_indexer_non_unique(target) + def asof_locs(self, where, mask): + """ + Finds the locations (indices) of the labels from the index for + every entry in the `where` argument. - if preserve_names and target.nlevels == 1 and target.name != self.name: - target = target.copy() - target.name = self.name + As in the `asof` function, if the label (a particular entry in + `where`) is not in the index, the latest index label upto the + passed label is chosen and its index returned. - return target, indexer + If all of the labels in the index are later than a label in `where`, + -1 is returned. - def _reindex_non_unique(self, target): - """ - Create a new index with target's values (move/add/delete values as - necessary) use with non-unique Index and a possibly non-unique target. + `mask` is used to ignore NA values in the index during calculation. Parameters ---------- - target : an iterable + where : Index + An Index consisting of an array of timestamps. + mask : array-like + Array of booleans denoting where values in the original + data are not NA. 
Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index - + numpy.ndarray + An array of locations (indices) of the labels from the Index + which correspond to the return values of the `asof` function + for every element in `where`. """ + locs = self.values[mask].searchsorted(where.values, side='right') + locs = np.where(locs > 0, locs - 1, 0) - target = ensure_index(target) - indexer, missing = self.get_indexer_non_unique(target) - check = indexer != -1 - new_labels = self.take(indexer[check]) - new_indexer = None - - if len(missing): - length = np.arange(len(indexer)) - - missing = ensure_platform_int(missing) - missing_labels = target.take(missing) - missing_indexer = ensure_int64(length[~check]) - cur_labels = self.take(indexer[check]).values - cur_indexer = ensure_int64(length[check]) + result = np.arange(len(self))[mask].take(locs) - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 - # a unique indexer - if target.is_unique: + return result - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange(len(cur_labels)) - new_indexer[missing_indexer] = -1 + def sort_values(self, return_indexer=False, ascending=True): + """ + Return a sorted copy of the index. - # we have a non_unique selector, need to use the original - # indexer here - else: + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. - # need to retake to have the same size as the indexer - indexer[~check] = -1 + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. - new_index = self._shallow_copy_with_infer(new_labels, freq=None) - return new_index, indexer, new_indexer + See Also + -------- + pandas.Series.sort_values : Sort values of a Series. + pandas.DataFrame.sort_values : Sort values in a DataFrame. - # -------------------------------------------------------------------- - # Join Methods + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') - _index_shared_docs['join'] = """ - Compute join_index and indexers to conform data - structures to the new index. + Sort values in ascending order (default behavior). - Parameters - ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - sort : boolean, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword) + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') - .. versionadded:: 0.20.0 + Sort values in descending order, and also get the indices `idx` was + sorted by. 
- Returns - ------- - join_index, (left_indexer, right_indexer) + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + sorted_index = self.take(_as) - # try to figure out the join level - # GH3662 - if level is None and (self_is_mi or other_is_mi): + if return_indexer: + return sorted_index, _as + else: + return sorted_index - # have the same levels/names so a simple join - if self.names == other.names: - pass - else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + def sort(self, *args, **kwargs): + raise TypeError("cannot sort an Index object in-place, use " + "sort_values instead") - # join on the level - if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. - other = ensure_index(other) + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. - if len(other) == 0 and how in ('left', 'outer'): - join_index = self._shallow_copy() - if return_indexers: - rindexer = np.repeat(-1, len(join_index)) - return join_index, None, rindexer - else: - return join_index + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. - if len(self) == 0 and how in ('right', 'outer'): - join_index = other._shallow_copy() - if return_indexers: - lindexer = np.repeat(-1, len(join_index)) - return join_index, lindexer, None - else: - return join_index + Returns + ------- + pandas.Index + shifted index - if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) - if return_indexers: - x, y, z = result - result = x, z, y - return result + See Also + -------- + Series.shift : Shift values of Series. - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.join(other, how=how, return_indexers=return_indexers) + Examples + -------- + Put the first 5 month starts of 2011 into an index. 
- _validate_join_method(how) + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic and other.is_monotonic: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - else: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif self.is_monotonic and other.is_monotonic: - try: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - except TypeError: - pass + Shift the index by 10 days. - if how == 'left': - join_index = self - elif how == 'right': - join_index = other - elif how == 'inner': - join_index = self.intersection(other) - elif how == 'outer': - join_index = self.union(other) + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) - if sort: - join_index = join_index.sort_values() + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. - if return_indexers: - if join_index is self: - lindexer = None - else: - lindexer = self.get_indexer(join_index) - if join_index is other: - rindexer = None - else: - rindexer = other.get_indexer(join_index) - return join_index, lindexer, rindexer - else: - return join_index + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') - def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex - from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. + """ + raise NotImplementedError("Not supported for type %s" % + type(self).__name__) - # figure out join names - self_names = set(com._not_none(*self.names)) - other_names = set(com._not_none(*other.names)) - overlap = self_names & other_names + def argsort(self, *args, **kwargs): + """ + Return the integer indices that would sort the index. - # need at least 1 in common - if not overlap: - raise ValueError("cannot join with no overlapping index names") + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + Returns + ------- + numpy.ndarray + Integer indices that would sort the index if used as + an indexer. - if self_is_mi and other_is_mi: + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. 
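+
+ Notes
+ -----
+ The sort is computed on the underlying ``asi8`` integer view when it
+ exists; otherwise the values are first converted with ``np.array``.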
- # Drop the non-matching levels from left and right respectively - ldrop_names = list(self_names - overlap) - rdrop_names = list(other_names - overlap) + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) - # Join left and right - # Join on same leveled multi-index frames is supported - join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, - return_indexers=True) + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + result = self.asi8 + if result is None: + result = np.array(self) + return result.argsort(*args, **kwargs) - # Restore the dropped levels - # Returned index level order is - # common levels, ldrop_names, rdrop_names - dropped_names = ldrop_names + rdrop_names + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. + """ - levels, labels, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) + # if we have something that is Index-like, then + # use this, e.g. DatetimeIndex + s = getattr(series, '_values', None) + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): + # GH 20882, 21257 + # Unify Index and ExtensionArray treatment + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): + raise + elif is_integer(key): + return s[key] - # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, labels=labels, - names=names, verify_integrity=False) + s = com.values_from_object(series) + k = com.values_from_object(key) - multi_join_idx = multi_join_idx.remove_unused_levels() + k = self._convert_scalar_indexer(k, kind='getitem') + try: + return self._engine.get_value(s, k, + tz=getattr(series.dtype, 'tz', None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise - return multi_join_idx, lidx, ridx + try: + return libindex.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if is_scalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) - jl = list(overlap)[0] + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. - # Case where only one index is multi - # make the indices into mi's that match - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) + Notes + ----- + Only use this if you know what you're doing. + """ + self._engine.set_value(com.values_from_object(arr), + com.values_from_object(key), value) - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) + _index_shared_docs['get_indexer_non_unique'] = """ + Compute indexer and mask for new index given the current index. 
The
+ indexer should then be used as an input to ndarray.take to align the
+ current data to the new index.
- if flip_order:
- if isinstance(result, tuple):
- return result[0], result[2], result[1]
- return result
+ Parameters
+ ----------
+ target : %(target_klass)s
- def _join_non_unique(self, other, how='left', return_indexers=False):
- from pandas.core.reshape.merge import _get_join_indexers
+ Returns
+ -------
+ indexer : ndarray of int
+ Integers from 0 to n - 1 indicating that the index at these
+ positions matches the corresponding target values. Missing values
+ in the target are marked by -1.
+ missing : ndarray of int
+ An indexer into the target of the values not found.
+ These correspond to the -1 in the indexer array.
+ """
- left_idx, right_idx = _get_join_indexers([self._ndarray_values],
- [other._ndarray_values],
- how=how,
- sort=True)
+ @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ def get_indexer_non_unique(self, target):
+ target = ensure_index(target)
+ if is_categorical(target):
+ target = target.astype(target.dtype.categories.dtype)
+ pself, ptarget = self._maybe_promote(target)
+ if pself is not self or ptarget is not target:
+ return pself.get_indexer_non_unique(ptarget)
- left_idx = ensure_platform_int(left_idx)
- right_idx = ensure_platform_int(right_idx)
+ if self.is_all_dates:
+ self = Index(self.asi8)
+ tgt_values = target.asi8
+ else:
+ tgt_values = target._ndarray_values
- join_index = np.asarray(self._ndarray_values.take(left_idx))
- mask = left_idx == -1
- np.putmask(join_index, mask, other._ndarray_values.take(right_idx))
+ indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
+ return ensure_platform_int(indexer), missing
- join_index = self._wrap_joined_index(join_index, other)
+ def get_indexer_for(self, target, **kwargs):
+ """
+ Guaranteed return of an indexer even when non-unique.
- if return_indexers:
- return join_index, left_idx, right_idx
- else:
- return join_index
+ This dispatches to get_indexer or get_indexer_non_unique
+ as appropriate.
+ """
+ if self.is_unique:
+ return self.get_indexer(target, **kwargs)
+ indexer, _ = self.get_indexer_non_unique(target, **kwargs)
+ return indexer
- def _join_level(self, other, level, how='left', return_indexers=False,
- keep_order=True):
+ def _maybe_promote(self, other):
+ # A hack, but it works
+ from pandas import DatetimeIndex
+ if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
+ return DatetimeIndex(self), other
+ elif self.inferred_type == 'boolean':
+ if not is_object_dtype(self.dtype):
+ return self.astype('object'), other.astype('object')
+ return self, other
+
+ def groupby(self, values):
"""
- The join method *only* affects the level of the resulting
- MultiIndex. Otherwise it just exactly aligns the Index data to the
- labels of the level in the MultiIndex.
+ Group the index labels by a given array of values.
- If ```keep_order == True```, the order of the data indexed by the
- MultiIndex will not be changed; otherwise, it will tie out
- with `other`.
+ Parameters
+ ----------
+ values : array
+ Values used to determine the groups.
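+ Must be the same length as the calling index.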
+
+ Returns
+ -------
+ groups : dict
+ {group name -> group labels}
"""
+
+ # TODO: if we are a MultiIndex, we can do better
+ # than converting to tuples
from .multi import MultiIndex
+ if isinstance(values, MultiIndex):
+ values = values.values
+ values = ensure_categorical(values)
+ result = values._reverse_indexer()
- def _get_leaf_sorter(labels):
- """
- Returns sorter for the inner most level while preserving the
- order of higher levels.
- """
- if labels[0].size == 0:
- return np.empty(0, dtype='int64')
+ # map to the label
+ result = {k: self.take(v) for k, v in compat.iteritems(result)}
- if len(labels) == 1:
- lab = ensure_int64(labels[0])
- sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max())
- return sorter
+ return result
- # find indexers of beginning of each set of
- # same-key labels w.r.t all but last level
- tic = labels[0][:-1] != labels[0][1:]
- for lab in labels[1:-1]:
- tic |= lab[:-1] != lab[1:]
+ def map(self, mapper, na_action=None):
+ """
+ Map values using input correspondence (a dict, Series, or function).
- starts = np.hstack(([True], tic, [True])).nonzero()[0]
- lab = ensure_int64(labels[-1])
- return lib.get_level_sorter(lab, ensure_int64(starts))
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ Mapping correspondence.
+ na_action : {None, 'ignore'}
+ If 'ignore', propagate NA values, without passing them to the
+ mapping correspondence.
- if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
- raise TypeError('Join on level between two MultiIndex objects '
- 'is ambiguous')
+ Returns
+ -------
+ applied : Union[Index, MultiIndex], inferred
+ The output of the mapping function applied to the index.
+ If the function returns a tuple with more than one element
+ a MultiIndex will be returned.
+ """
- left, right = self, other
+ from .multi import MultiIndex
+ new_values = super(Index, self)._map_values(
+ mapper, na_action=na_action)
- flip_order = not isinstance(self, MultiIndex)
- if flip_order:
- left, right = right, left
- how = {'right': 'left', 'left': 'right'}.get(how, how)
+ attributes = self._get_attributes_dict()
- level = left._get_level_number(level)
- old_level = left.levels[level]
+ # we can return a MultiIndex
+ if new_values.size and isinstance(new_values[0], tuple):
+ if isinstance(self, MultiIndex):
+ names = self.names
+ elif attributes.get('name'):
+ names = [attributes.get('name')] * len(new_values[0])
+ else:
+ names = None
+ return MultiIndex.from_tuples(new_values,
+ names=names)
- if not right.is_unique:
- raise NotImplementedError('Index._join_level on non-unique index '
- 'is not implemented')
+ attributes['copy'] = False
+ if not new_values.size:
+ # empty
+ attributes['dtype'] = self.dtype
- new_level, left_lev_indexer, right_lev_indexer = \
- old_level.join(right, how=how, return_indexers=True)
+ return Index(new_values, **attributes)
- if left_lev_indexer is None:
- if keep_order or len(left) == 0:
- left_indexer = None
- join_index = left
- else: # sort the leaves
- left_indexer = _get_leaf_sorter(left.labels[:level + 1])
- join_index = left[left_indexer]
+ def isin(self, values, level=None):
+ """
+ Return a boolean array where the index values are in `values`.
- else:
- left_lev_indexer = ensure_int64(left_lev_indexer)
- rev_indexer = lib.get_reverse_indexer(left_lev_indexer,
- len(old_level))
+ Compute boolean array of whether each index value is found in the
+ passed set of values. The length of the returned boolean array matches
+ the length of the index.
- new_lev_labels = algos.take_nd(rev_indexer, left.labels[level],
- allow_fill=False)
+ Parameters
+ ----------
+ values : set or list-like
+ Sought values.
- new_labels = list(left.labels)
- new_labels[level] = new_lev_labels
+ .. versionadded:: 0.18.1
- new_levels = list(left.levels)
- new_levels[level] = new_level
+ Support for values as a set.
- if keep_order: # just drop missing values. o.w. keep order
- left_indexer = np.arange(len(left), dtype=np.intp)
- mask = new_lev_labels != -1
- if not mask.all():
- new_labels = [lab[mask] for lab in new_labels]
- left_indexer = left_indexer[mask]
+ level : str or int, optional
+ Name or position of the index level to use (if the index is a
+ `MultiIndex`).
- else: # tie out the order with other
- if level == 0: # outer most level, take the fast route
- ngroups = 1 + new_lev_labels.max()
- left_indexer, counts = libalgos.groupsort_indexer(
- new_lev_labels, ngroups)
+ Returns
+ -------
+ is_contained : ndarray
+ NumPy array of boolean values.
- # missing values are placed first; drop them!
- left_indexer = left_indexer[counts[0]:]
- new_labels = [lab[left_indexer] for lab in new_labels]
+ See Also
+ --------
+ Series.isin : Same for Series.
+ DataFrame.isin : Same method for DataFrames.
- else: # sort the leaves
- mask = new_lev_labels != -1
- mask_all = mask.all()
- if not mask_all:
- new_labels = [lab[mask] for lab in new_labels]
+ Notes
+ -----
+ In the case of `MultiIndex` you must either specify `values` as a
+ list-like object containing tuples that are the same length as the
+ number of levels, or specify `level`. Otherwise it will raise a
+ ``ValueError``.
- left_indexer = _get_leaf_sorter(new_labels[:level + 1])
- new_labels = [lab[left_indexer] for lab in new_labels]
+ If `level` is specified:
- # left_indexers are w.r.t masked frame.
- # reverse to original frame!
- if not mask_all:
- left_indexer = mask.nonzero()[0][left_indexer]
+ - if it is the name of one *and only one* index level, use that level;
+ - otherwise it should be a number indicating level position.
- join_index = MultiIndex(levels=new_levels, labels=new_labels,
- names=left.names, verify_integrity=False)
+ Examples
+ --------
+ >>> idx = pd.Index([1,2,3])
+ >>> idx
+ Int64Index([1, 2, 3], dtype='int64')
- if right_lev_indexer is not None:
- right_indexer = algos.take_nd(right_lev_indexer,
- join_index.labels[level],
- allow_fill=False)
- else:
- right_indexer = join_index.labels[level]
+ Check whether each index value is in a list of values.
+ >>> idx.isin([1, 4])
+ array([ True, False, False])
- if flip_order:
- left_indexer, right_indexer = right_indexer, left_indexer
+ >>> midx = pd.MultiIndex.from_arrays([[1,2,3],
+ ... ['red', 'blue', 'green']],
+ ... names=('number', 'color'))
+ >>> midx
+ MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']],
+ labels=[[0, 1, 2], [2, 0, 1]],
+ names=['number', 'color'])
- if return_indexers:
- left_indexer = (None if left_indexer is None
- else ensure_platform_int(left_indexer))
- right_indexer = (None if right_indexer is None
- else ensure_platform_int(right_indexer))
- return join_index, left_indexer, right_indexer
- else:
- return join_index
+ Check whether the strings in the 'color' level of the MultiIndex
+ are in a list of colors.
- def _join_monotonic(self, other, how='left', return_indexers=False): - if self.equals(other): - ret_index = other if how == 'right' else self - if return_indexers: - return ret_index, None, None - else: - return ret_index + >>> midx.isin(['red', 'orange', 'yellow'], level='color') + array([ True, False, False]) - sv = self._ndarray_values - ov = other._ndarray_values + To check across the levels of a MultiIndex, pass a list of tuples: - if self.is_unique and other.is_unique: - # We can perform much better than the general case - if how == 'left': - join_index = self - lidx = None - ridx = self._left_indexer_unique(sv, ov) - elif how == 'right': - join_index = other - lidx = self._left_indexer_unique(ov, sv) - ridx = None - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - else: - if how == 'left': - join_index, lidx, ridx = self._left_indexer(sv, ov) - elif how == 'right': - join_index, ridx, lidx = self._left_indexer(ov, sv) - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + >>> midx.isin([(1, 'red'), (3, 'red')]) + array([ True, False, False]) - if return_indexers: - lidx = None if lidx is None else ensure_platform_int(lidx) - ridx = None if ridx is None else ensure_platform_int(ridx) - return join_index, lidx, ridx - else: - return join_index + For a DatetimeIndex, string values in `values` are converted to + Timestamps. - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Index(joined, name=name) + >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] + >>> dti = pd.to_datetime(dates) + >>> dti + DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], + dtype='datetime64[ns]', freq=None) - # -------------------------------------------------------------------- + >>> dti.isin(['2000-03-11']) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self, values) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing,