diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 1749409c863df..688935c6b104d 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -594,6 +594,95 @@ faster than fancy indexing. timeit ser.ix[indexer] timeit ser.take(indexer) +.. _indexing.categoricalindex: + +CategoricalIndex +---------------- + +.. versionadded:: 0.16.1 + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to regular object-based ``Index``. + +.. ipython:: python + + df = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('cab')) + }) + df + df.dtypes + df.B.cat.categories + +Setting the index, will create a ``CategoricalIndex`` + +.. ipython:: python + + df2 = df.set_index('B') + df2.index + +Indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an ``Index`` with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython:: python + + df2.loc['a'] + +These PRESERVE the ``CategoricalIndex`` + +.. ipython:: python + + df2.loc['a'].index + +Sorting will order by the order of the categories + +.. ipython:: python + + df2.sort_index() + +Groupby operations on the index will preserve the index nature as well + +.. ipython:: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +Reindexing operations, will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old-``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. 
This allows one to arbitrarily index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. ipython :: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +.. warning:: + + Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories + or a ``TypeError`` will be raised. + + .. code-block:: python + + In [10]: df3 = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('abc')) + }).set_index('B') + + In [11]: df3.index + Out[11]: + CategoricalIndex([u'a', u'a', u'b', u'b', u'c', u'a'], + categories=[u'a', u'b', u'c'], + ordered=False) + + In [12]: pd.concat([df2,df3]) + TypeError: categories must match existing categories when appending + .. _indexing.float64index: Float64Index @@ -706,4 +795,3 @@ Of course if you need integer based selection, then use ``iloc`` .. ipython:: python dfir.iloc[0:5] - diff --git a/doc/source/api.rst b/doc/source/api.rst index af9f8c84388bd..b1540ff528605 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1291,6 +1291,34 @@ Selecting Index.slice_indexer Index.slice_locs +.. _api.categoricalindex: + +CategoricalIndex +---------------- + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex + +Categorical Components +~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + CategoricalIndex.codes + CategoricalIndex.categories + CategoricalIndex.ordered + CategoricalIndex.rename_categories + CategoricalIndex.reorder_categories + CategoricalIndex.add_categories + CategoricalIndex.remove_categories + CategoricalIndex.remove_unused_categories + CategoricalIndex.set_categories + CategoricalIndex.as_ordered + CategoricalIndex.as_unordered + .. 
_api.datetimeindex: DatetimeIndex diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index fcb5cd6a5ec30..cbd5ad3f49c18 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -7,6 +7,10 @@ This is a minor bug-fix release from 0.16.0 and includes a a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. +Highlights include: + +- Support for a ``CategoricalIndex``, a category based index, see :ref:`here ` + .. contents:: What's new in v0.16.1 :local: :backlinks: none @@ -31,6 +35,7 @@ Enhancements will return a `np.array` instead of a boolean `Index` (:issue:`8875`). This enables the following expression to work naturally: + .. ipython:: python idx = Index(['a1', 'a2', 'b1', 'b2']) @@ -40,6 +45,7 @@ Enhancements s[s.index.str.startswith('a')] - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) + - ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of label does not exist in the target data. (:issue:`6736`) .. ipython:: python @@ -58,6 +64,75 @@ Enhancements - ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +.. _whatsnew_0161.enhancements.categoricalindex: + +CategoricalIndex +^^^^^^^^^^^^^^^^ + +We introduce a ``CategoricalIndex``, a new type of index object that is useful for supporting +indexing with duplicates. This is a container around a ``Categorical`` (introduced in v0.15.0) +and allows efficient indexing and storage of an index with a large number of duplicated elements. Prior to 0.16.1, +setting the index of a ``DataFrame/Series`` with a ``category`` dtype would convert this to regular object-based ``Index``. + +.. 
ipython :: python + + df = DataFrame({'A' : np.arange(6), + 'B' : Series(list('aabbca')).astype('category', + categories=list('cab')) + }) + df + df.dtypes + df.B.cat.categories + +setting the index, will create a CategoricalIndex + +.. ipython :: python + + df2 = df.set_index('B') + df2.index + +indexing with ``__getitem__/.iloc/.loc/.ix`` works similarly to an Index with duplicates. +The indexers MUST be in the category or the operation will raise. + +.. ipython :: python + + df2.loc['a'] + +and preserves the ``CategoricalIndex`` + +.. ipython :: python + + df2.loc['a'].index + +sorting will order by the order of the categories + +.. ipython :: python + + df2.sort_index() + +groupby operations on the index will preserve the index nature as well + +.. ipython :: python + + df2.groupby(level=0).sum() + df2.groupby(level=0).sum().index + +reindexing operations, will return a resulting index based on the type of the passed +indexer, meaning that passing a list will return a plain-old-``Index``; indexing with +a ``Categorical`` will return a ``CategoricalIndex``, indexed according to the categories +of the PASSED ``Categorical`` dtype. This allows one to arbitrarily index these even with +values NOT in the categories, similarly to how you can reindex ANY pandas index. + +.. ipython :: python + + df2.reindex(['a','e']) + df2.reindex(['a','e']).index + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) + df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index + +See the :ref:`documentation ` for more. (:issue:`7629`) + + .. 
_whatsnew_0161.api: API changes diff --git a/pandas/core/api.py b/pandas/core/api.py index a8b10342593ce..fde9bc77c4bd9 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -8,7 +8,7 @@ from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.core.format import set_eng_float_format -from pandas.core.index import Index, Int64Index, Float64Index, MultiIndex +from pandas.core.index import Index, CategoricalIndex, Int64Index, Float64Index, MultiIndex from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/base.py b/pandas/core/base.py index a25651a73f507..c0233a5a33308 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -121,7 +121,7 @@ def _delegate_method(self, name, *args, **kwargs): raise TypeError("You cannot call method {name}".format(name=name)) @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ): + def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): """ add accessors to cls from the delegate class @@ -131,6 +131,8 @@ def _add_delegate_accessors(cls, delegate, accessors, typ): delegate : the class to get methods/properties & doc-strings acccessors : string list of accessors to add typ : 'property' or 'method' + overwrite : boolean, default False + overwrite the method/property in the target class if it exists """ @@ -164,7 +166,7 @@ def f(self, *args, **kwargs): f = _create_delegator_method(name) # don't overwrite existing methods/properties - if not hasattr(cls, name): + if overwrite or not hasattr(cls, name): setattr(cls,name,f) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0d66a89b0a585..9537523380350 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -9,12 +9,11 @@ from pandas.core.algorithms import factorize from pandas.core.base import PandasObject, PandasDelegate -from pandas.core.index import Index, _ensure_index -from 
pandas.tseries.period import PeriodIndex import pandas.core.common as com from pandas.util.decorators import cache_readonly -from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull, +from pandas.core.common import (CategoricalDtype, ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, + isnull, notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype, is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like, is_sequence, is_null_slice, is_bool, @@ -22,7 +21,6 @@ _coerce_indexer_dtype, _values_from_object, take_1d) from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option -from pandas.core import format as fmt def _cat_compare_op(op): def f(self, other): @@ -86,7 +84,7 @@ def f(self, other): def maybe_to_categorical(array): """ coerce to a categorical if a series is given """ - if isinstance(array, ABCSeries): + if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array.values return array @@ -236,15 +234,17 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F # sanitize input if is_categorical_dtype(values): - # we are either a Series or a Categorical - cat = values - if isinstance(values, ABCSeries): - cat = values.values + # we are either a Series or a CategoricalIndex + if isinstance(values, (ABCSeries, ABCCategoricalIndex)): + values = values.values + + if ordered is None: + ordered = values.ordered if categories is None: - categories = cat.categories + categories = values.categories values = values.__array__() - elif isinstance(values, Index): + elif isinstance(values, ABCIndexClass): pass else: @@ -295,11 +295,11 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F warn("Values and categories have different dtypes. 
Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - if is_integer_dtype(values) and (codes == -1).all(): + if len(values) and is_integer_dtype(values) and (codes == -1).all(): warn("None of the categories were found in values. Did you mean to use\n" "'Categorical.from_codes(codes, categories)'?", RuntimeWarning) - self.set_ordered(ordered, inplace=True) + self.set_ordered(ordered or False, inplace=True) self.categories = categories self.name = name self._codes = _coerce_indexer_dtype(codes, categories) @@ -309,11 +309,27 @@ def copy(self): return Categorical(values=self._codes.copy(),categories=self.categories, name=self.name, ordered=self.ordered, fastpath=True) + def astype(self, dtype): + """ coerce this type to another dtype """ + if is_categorical_dtype(dtype): + return self + return np.array(self, dtype=dtype) + @cache_readonly def ndim(self): """Number of dimensions of the Categorical """ return self._codes.ndim + @cache_readonly + def size(self): + """ return the len of myself """ + return len(self) + + @cache_readonly + def itemsize(self): + """ return the size of a single category """ + return self.categories.itemsize + def reshape(self, new_shape, **kwargs): """ compat with .reshape """ return self @@ -395,7 +411,8 @@ def _set_codes(self, codes): codes = property(fget=_get_codes, fset=_set_codes, doc=_codes_doc) def _get_labels(self): - """ Get the category labels (deprecated). + """ + Get the category labels (deprecated). Deprecated, use .codes! 
""" @@ -409,8 +426,10 @@ def _get_labels(self): @classmethod def _validate_categories(cls, categories): - """" Validates that we have good categories """ - if not isinstance(categories, Index): + """ + Validates that we have good categories + """ + if not isinstance(categories, ABCIndexClass): dtype = None if not hasattr(categories, "dtype"): categories = _convert_to_list_like(categories) @@ -421,6 +440,8 @@ def _validate_categories(cls, categories): with_na = np.array(categories) if with_na.dtype != without_na.dtype: dtype = "object" + + from pandas import Index categories = Index(categories, dtype=dtype) if not categories.is_unique: raise ValueError('Categorical categories must be unique') @@ -761,6 +782,8 @@ def remove_unused_categories(self, inplace=False): cat = self if inplace else self.copy() _used = sorted(np.unique(cat._codes)) new_categories = cat.categories.take(_ensure_platform_int(_used)) + + from pandas.core.index import _ensure_index new_categories = _ensure_index(new_categories) cat._codes = _get_codes_for_values(cat.__array__(), new_categories) cat._categories = new_categories @@ -790,7 +813,8 @@ def shape(self): return tuple([len(self._codes)]) def __array__(self, dtype=None): - """ The numpy array interface. + """ + The numpy array interface. 
Returns ------- @@ -799,7 +823,7 @@ def __array__(self, dtype=None): dtype as categorical.categories.dtype """ ret = take_1d(self.categories.values, self._codes) - if dtype and dtype != self.categories.dtype: + if dtype and not is_dtype_equal(dtype,self.categories.dtype): return np.asarray(ret, dtype) return ret @@ -997,7 +1021,7 @@ def get_values(self): """ # if we are a period index, return a string repr - if isinstance(self.categories, PeriodIndex): + if isinstance(self.categories, ABCPeriodIndex): return take_1d(np.array(self.categories.to_native_types(), dtype=object), self._codes) @@ -1243,7 +1267,8 @@ def __iter__(self): """Returns an Iterator over the values of this Categorical.""" return iter(np.array(self)) - def _tidy_repr(self, max_vals=10): + def _tidy_repr(self, max_vals=10, footer=True): + """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 head = self[:num]._get_repr(length=False, name=False, footer=False) tail = self[-(max_vals - num):]._get_repr(length=False, @@ -1251,23 +1276,31 @@ def _tidy_repr(self, max_vals=10): footer=False) result = '%s, ..., %s' % (head[:-1], tail[1:]) - result = '%s\n%s' % (result, self._repr_footer()) + if footer: + result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) - def _repr_categories_info(self): - """ Returns a string representation of the footer.""" - + def _repr_categories(self): + """ return the base repr for the categories """ max_categories = (10 if get_option("display.max_categories") == 0 else get_option("display.max_categories")) + from pandas.core import format as fmt category_strs = fmt.format_array(self.categories.get_values(), None) if len(category_strs) > max_categories: num = max_categories // 2 head = category_strs[:num] tail = category_strs[-(max_categories - num):] category_strs = head + ["..."] + tail + # Strip all leading spaces, which format_array adds for columns... 
category_strs = [x.strip() for x in category_strs] + return category_strs + + def _repr_categories_info(self): + """ Returns a string representation of the footer.""" + + category_strs = self._repr_categories() levheader = "Categories (%d, %s): " % (len(self.categories), self.categories.dtype) width, height = get_terminal_size() @@ -1299,8 +1332,11 @@ def _repr_footer(self): len(self), self._repr_categories_info()) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): - formatter = fmt.CategoricalFormatter(self, name=name, - length=length, na_rep=na_rep, + from pandas.core import format as fmt + formatter = fmt.CategoricalFormatter(self, + name=name, + length=length, + na_rep=na_rep, footer=footer) result = formatter.to_string() return compat.text_type(result) @@ -1315,9 +1351,9 @@ def __unicode__(self): name=True) else: result = '[], %s' % self._get_repr(name=True, - length=False, - footer=True, - ).replace("\n",", ") + length=False, + footer=True, + ).replace("\n",", ") return result @@ -1358,6 +1394,8 @@ def __setitem__(self, key, value): "categories") rvalue = value if is_list_like(value) else [value] + + from pandas import Index to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set something to np.nan @@ -1516,11 +1554,27 @@ def equals(self, other): ------- are_equal : boolean """ - if not isinstance(other, Categorical): - return False # TODO: should this also test if name is equal? 
- return (self.categories.equals(other.categories) and self.ordered == other.ordered and - np.array_equal(self._codes, other._codes)) + return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes) + + def is_dtype_equal(self, other): + """ + Returns True if categoricals are the same dtype + same categories, and same ordered + + Parameters + ---------- + other : Categorical + + Returns + ------- + are_equal : boolean + """ + + try: + return self.categories.equals(other.categories) and self.ordered == other.ordered + except (AttributeError, TypeError): + return False def describe(self): """ Describes this Categorical @@ -1604,18 +1658,20 @@ def _delegate_method(self, name, *args, **kwargs): ##### utility routines ##### def _get_codes_for_values(values, categories): - """" + """ utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != categories.dtype: + if not is_dtype_equal(values.dtype,categories.dtype): values = _ensure_object(values) categories = _ensure_object(categories) + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(categories)) - t.map_locations(_values_from_object(categories)) - return _coerce_indexer_dtype(t.lookup(values), categories) + (_, _), cats = _get_data_algo(categories, _hashtables) + t = hash_klass(len(cats)) + t.map_locations(cats) + return _coerce_indexer_dtype(t.lookup(vals), cats) def _convert_to_list_like(list_like): if hasattr(list_like, "dtype"): diff --git a/pandas/core/common.py b/pandas/core/common.py index ffe12d0c1546c..3d23aeff942dc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -83,6 +83,16 @@ def _check(cls, inst): ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)) ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) ABCPeriodIndex = 
create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) +ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", + "int64index", + "float64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex")) + ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) @@ -2455,11 +2465,27 @@ def _get_dtype_type(arr_or_dtype): return np.dtype(arr_or_dtype).type elif isinstance(arr_or_dtype, CategoricalDtype): return CategoricalDtypeType + elif isinstance(arr_or_dtype, compat.string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type except AttributeError: raise ValueError('%r is not a dtype' % arr_or_dtype) +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + source = _get_dtype_type(source) + target = _get_dtype_type(target) + + try: + return source == target + except TypeError: + + # invalid comparison + # object == category will hit this + return False def is_any_int_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4ef3bbce85467..e5b1a96f81677 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -14,7 +14,7 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import Index, MultiIndex, _ensure_index, _union_indexes +from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index, _union_indexes from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from 
pandas.core.panel import Panel @@ -1928,7 +1928,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = com._asarray_tuplesafe(self.grouper) # a passed Categorical - elif isinstance(self.grouper, Categorical): + elif is_categorical_dtype(self.grouper): # must have an ordered categorical if self.sort: @@ -1942,8 +1942,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # fix bug #GH8868 sort=False being ignored in categorical groupby else: self.grouper = self.grouper.reorder_categories(self.grouper.unique()) + + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes self._labels = self.grouper.codes - self._group_index = self.grouper.categories + + c = self.grouper.categories + self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)), + categories=c, + ordered=self.grouper.ordered)) if self.name is None: self.name = self.grouper.name @@ -2131,8 +2138,8 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if isinstance(gpr, Categorical) and len(gpr) != len(obj): - raise ValueError("Categorical grouper must have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != len(obj): + raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)") ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level, sort=sort, in_axis=in_axis) @@ -3252,7 +3259,7 @@ def _reindex_output(self, result): return result elif len(groupings) == 1: return result - elif not any([isinstance(ping.grouper, Categorical) + elif not any([isinstance(ping.grouper, (Categorical, CategoricalIndex)) for ping in groupings]): return result diff --git a/pandas/core/index.py b/pandas/core/index.py index 8b509c6876ec7..8b650fea9b440 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2,6 +2,7 @@ import datetime import warnings import operator + from functools import partial from pandas.compat import range, zip, lrange, 
lzip, u, reduce, filter, map from pandas import compat @@ -13,13 +14,13 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array -from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs +from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate from pandas.util.decorators import (Appender, Substitution, cache_readonly, deprecate) -from pandas.core.common import isnull, array_equivalent import pandas.core.common as com -from pandas.core.common import (_values_from_object, is_float, is_integer, - ABCSeries, _ensure_object, _ensure_int64, is_bool_indexer, +from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, + _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, + ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -44,26 +45,6 @@ def _try_get_item(x): except AttributeError: return x -def _indexOp(opname): - """ - Wrapper function for index comparison operations, to avoid - code duplication. 
- """ - def wrapper(self, other): - func = getattr(self.values, opname) - result = func(np.asarray(other)) - - # technically we could support bool dtyped Index - # for now just return the indexing array directly - if is_bool_dtype(result): - return result - try: - return Index(result) - except: # pragma: no cover - return result - return wrapper - - class InvalidIndexError(Exception): pass @@ -162,6 +143,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) else: subarr = com._asarray_tuplesafe(data, dtype=object) @@ -170,6 +153,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=False, if copy: subarr = subarr.copy() + elif is_categorical_dtype(data) or is_categorical_dtype(dtype): + return CategoricalIndex(data, copy=copy, name=name, **kwargs) elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -258,7 +243,7 @@ def __len__(self): """ return len(self._data) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self._data.view(np.ndarray) @@ -282,9 +267,6 @@ def get_values(self): """ return the underlying data as an ndarray """ return self.values - def _array_values(self): - return self._data - # ops compat def tolist(self): """ @@ -410,8 +392,7 @@ def __unicode__(self): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. 
""" - prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) + prepr = default_pprint(self) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) def to_series(self, **kwargs): @@ -429,9 +410,10 @@ def to_series(self, **kwargs): def _to_embed(self, keep_tz=False): """ + *this is an internal non-public method* + return an array repr of this object, potentially casting to object - This is for internal compat """ return self.values @@ -623,7 +605,10 @@ def is_numeric(self): return self.inferred_type in ['integer', 'floating'] def is_object(self): - return self.dtype == np.object_ + return is_object_dtype(self.dtype) + + def is_categorical(self): + return self.inferred_type in ['categorical'] def is_mixed(self): return 'mixed' in self.inferred_type @@ -772,14 +757,11 @@ def is_int(v): return indexer - def _convert_list_indexer(self, key, kind=None): - """ convert a list indexer. these should be locations """ - return key - - def _convert_list_indexer_for_mixed(self, keyarr, kind=None): - """ passed a key that is tuplesafe that is integer based - and we have a mixed index (e.g. number/labels). figure out - the indexer. return None if we can't help + def _convert_list_indexer(self, keyarr, kind=None): + """ + passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. 
return None if we can't help """ if (kind is None or kind in ['iloc','ix']) and (is_integer_dtype(keyarr) and not self.is_floating()): if self.inferred_type != 'integer': @@ -954,17 +936,13 @@ def __getitem__(self, key): else: return result - def append(self, other): + def _ensure_compat_append(self, other): """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices + prepare the append Returns ------- - appended : Index + list of to_concat, name of result Index """ name = self.name to_concat = [self] @@ -984,7 +962,21 @@ def append(self, other): to_concat = self._ensure_compat_concat(to_concat) to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] + return to_concat, name + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + Returns + ------- + appended : Index + """ + to_concat, name = self._ensure_compat_append(other) return Index(np.concatenate(to_concat), name=name) @staticmethod @@ -1046,10 +1038,12 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): from pandas.core.format import format_array - if values.dtype == np.object_: + if is_categorical_dtype(values.dtype): + values = np.array(values) + elif is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) - if values.dtype == np.object_: + if is_object_dtype(values.dtype): result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n')) for x in values] @@ -1092,9 +1086,6 @@ def equals(self, other): if not isinstance(other, Index): return False - if type(other) != Index: - return other.equals(self) - return array_equivalent(_values_from_object(self), _values_from_object(other)) def identical(self, other): @@ -1201,13 +1192,6 @@ def __sub__(self, other): "use .difference()",FutureWarning) return self.difference(other) - __eq__ = _indexOp('__eq__') - __ne__ = _indexOp('__ne__') - __lt__ = 
_indexOp('__lt__') - __gt__ = _indexOp('__gt__') - __le__ = _indexOp('__le__') - __ge__ = _indexOp('__ge__') - def __and__(self, other): return self.intersection(other) @@ -1240,7 +1224,7 @@ def union(self, other): self._assert_can_do_setop(other) - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) @@ -1314,7 +1298,7 @@ def intersection(self, other): if self.equals(other): return self - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) @@ -1473,7 +1457,7 @@ def get_value(self, series, key): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -1548,7 +1532,7 @@ def get_indexer(self, target, method=None, limit=None): if pself is not self or ptarget is not target: return pself.get_indexer(ptarget, method=method, limit=limit) - if self.dtype != target.dtype: + if not is_dtype_equal(self.dtype,target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit) @@ -1647,7 +1631,8 @@ def get_indexer_for(self, target, **kwargs): """ guaranteed return of an indexer even when non-unique """ if self.is_unique: return self.get_indexer(target, **kwargs) - return self.get_indexer_non_unique(target, **kwargs)[0] + indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer def _possibly_promote(self, other): # A hack, but it works @@ -1655,7 +1640,7 @@ def _possibly_promote(self, other): if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): return DatetimeIndex(self), other elif self.inferred_type == 'boolean': - if self.dtype != 'object': + if not is_object_dtype(self.dtype): return self.astype('object'), other.astype('object') return self, other @@ -1707,12 +1692,35 @@ def 
isin(self, values, level=None): value_set = set(values) if level is not None: self._validate_index_level(level) - return lib.ismember(self._array_values(), value_set) + return lib.ismember(np.array(self), value_set) + + def _can_reindex(self, indexer): + """ + *this is an internal non-public method* + + Check if we are allowing reindexing with this particular indexer + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None): """ Create index with target's values (move/add/delete values as necessary) + Parameters + ---------- + target : an iterable + Returns ------- new_index : pd.Index @@ -1733,6 +1741,7 @@ def reindex(self, target, method=None, level=None, limit=None): target = self._simple_new(np.empty(0, dtype=self.dtype), **attrs) else: target = _ensure_index(target) + if level is not None: if method is not None: raise TypeError('Fill method not supported if level passed') @@ -1757,9 +1766,72 @@ def reindex(self, target, method=None, level=None, limit=None): return target, indexer + def _reindex_non_unique(self, target): + """ + *this is an internal non-public method* + + Create a new index with target's values (move/add/delete values as necessary) + use with non-unique Index and a possibly non-unique target + + Parameters + ---------- + target : an iterable + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + target = _ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None + + if len(missing): + l = np.arange(len(indexer)) + + missing = com._ensure_platform_int(missing) + 
missing_labels = target.take(missing) + missing_indexer = com._ensure_int64(l[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = com._ensure_int64(l[check]) + + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels + + # a unique indexer + if target.is_unique: + + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 + + # we have a non_unique selector, need to use the original + # indexer here + else: + + # need to retake to have the same size as the indexer + indexer = indexer.values + indexer[~check] = 0 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + return self._shallow_copy(new_labels), indexer, new_indexer + def join(self, other, how='left', level=None, return_indexers=False): """ - Internal API method. Compute join_index and indexers to conform data + *this is an internal non-public method* + + Compute join_index and indexers to conform data structures to the new index. 
Parameters @@ -1818,7 +1890,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if self.dtype != other.dtype: + if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, @@ -2369,6 +2441,34 @@ def _evaluate_with_timedelta_like(self, other, op, opstr): def _evaluate_with_datetime_like(self, other, op, opstr): raise TypeError("can only perform ops with datetime like values") + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def _evaluate_compare(self, other): + func = getattr(self.values, op) + result = func(np.asarray(other)) + + # technically we could support bool dtyped Index + # for now just return the indexing array directly + if is_bool_dtype(result): + return result + try: + return Index(result) + except TypeError: + return result + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + @classmethod def _add_numeric_methods_disabled(cls): """ add in numeric methods to disable """ @@ -2423,7 +2523,7 @@ def _evaluate_numeric_binop(self, other): elif isinstance(other, (Timestamp, np.datetime64)): return self._evaluate_with_datetime_like(other, op, opstr) else: - if not (com.is_float(other) or com.is_integer(other)): + if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with scalar values") # if we are a reversed non-communative op @@ -2487,7 +2587,7 @@ def _make_logical_function(name, desc, f): @Appender(_doc) def logical_func(self, *args, **kwargs): result = f(self.values) - if isinstance(result, (np.ndarray, com.ABCSeries, Index)) \ + if isinstance(result, (np.ndarray, ABCSeries, Index)) \ and result.ndim == 0: # return NumPy type 
return result.dtype.type(result.item()) @@ -2519,6 +2619,539 @@ def invalid_op(self, other=None): Index._add_numeric_methods_disabled() Index._add_logical_methods() +Index._add_comparison_methods() + +class CategoricalIndex(Index, PandasDelegate): + """ + + Immutable Index implementing an ordered, sliceable set. CategoricalIndex + represents a sparsely populated Index with an underlying Categorical. + + Parameters + ---------- + data : array-like or Categorical, (1-dimensional) + categories : optional, array-like + categories for the CategoricalIndex + ordered : boolean, + designating if the categories are ordered + copy : bool + Make a copy of input ndarray + name : object + Name to be stored in the index + + """ + + _typ = 'categoricalindex' + _engine_type = _index.Int64Engine + _attributes = ['name','categories','ordered'] + + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): + + if fastpath: + return cls._simple_new(data, name=name) + + if isinstance(data, ABCCategorical): + data = cls._create_categorical(cls, data, categories, ordered) + elif isinstance(data, CategoricalIndex): + data = data._data + data = cls._create_categorical(cls, data, categories, ordered) + else: + + # don't allow scalars + # if data is None, then categories must be provided + if lib.isscalar(data): + if data is not None or categories is None: + cls._scalar_data_error(data) + data = [] + data = cls._create_categorical(cls, data, categories, ordered) + + if copy: + data = data.copy() + + return cls._simple_new(data, name=name) + + def _create_from_codes(self, codes, categories=None, ordered=None, name=None): + """ + *this is an internal non-public method* + + create the correct categorical from codes + + Parameters + ---------- + codes : new codes + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + name : optional name attribute, defaults to existing + 
+ Returns + ------- + CategoricalIndex + """ + + from pandas.core.categorical import Categorical + if categories is None: + categories = self.categories + if ordered is None: + ordered = self.ordered + if name is None: + name = self.name + cat = Categorical.from_codes(codes, categories=categories, ordered=self.ordered) + return CategoricalIndex(cat, name=name) + + @staticmethod + def _create_categorical(self, data, categories=None, ordered=None): + """ + *this is an internal non-public method* + + create the correct categorical from data and the properties + + Parameters + ---------- + data : data for new Categorical + categories : optional categories, defaults to existing + ordered : optional ordered attribute, defaults to existing + + Returns + ------- + Categorical + """ + + if not isinstance(data, ABCCategorical): + from pandas.core.categorical import Categorical + data = Categorical(data, categories=categories, ordered=ordered) + else: + if categories is not None: + data = data.set_categories(categories) + if ordered is not None: + data = data.set_ordered(ordered) + return data + + @classmethod + def _simple_new(cls, values, name=None, categories=None, ordered=None, **kwargs): + result = object.__new__(cls) + + values = cls._create_categorical(cls, values, categories, ordered) + result._data = values + result.name = name + for k, v in compat.iteritems(kwargs): + setattr(result,k,v) + + result._reset_identity() + return result + + def _is_dtype_compat(self, other): + """ + *this is an internal non-public method* + + provide a comparison between the dtype of self and other (coercing if needed) + + Raises + ------ + TypeError if the dtypes are not compatible + """ + + if is_categorical_dtype(other): + if isinstance(other, CategoricalIndex): + other = other.values + if not other.is_dtype_equal(self): + raise TypeError("categories must match existing categories when appending") + else: + values = other + other = CategoricalIndex(self._create_categorical(self, 
other, categories=self.categories, ordered=self.ordered)) + if not other.isin(values).all(): + raise TypeError("cannot append a non-category item to a CategoricalIndex") + + return other + + def equals(self, other): + """ + Determines if two CategorialIndex objects contain the same elements. + """ + if self.is_(other): + return True + + try: + other = self._is_dtype_compat(other) + return array_equivalent(self._data, other) + except (TypeError, ValueError): + pass + + return False + + def __unicode__(self): + """ + Return a string representation for this object. + + Invoked by unicode(df) in py2 only. Yields a Unicode String in both + py2/py3. + """ + + # currently doesn't use the display.max_categories, or display.max_seq_len + # for head/tail printing + values = default_pprint(self.values.get_values()) + cats = default_pprint(self.categories.get_values()) + space = ' ' * (len(self.__class__.__name__) + 1) + name = self.name + if name is not None: + name = default_pprint(name) + + result = u("{klass}({values},\n{space}categories={categories},\n{space}ordered={ordered},\n{space}name={name})").format( + klass=self.__class__.__name__, + values=values, + categories=cats, + ordered=self.ordered, + name=name, + space=space) + + return result + + @property + def inferred_type(self): + return 'categorical' + + @property + def values(self): + """ return the underlying data, which is a Categorical """ + return self._data + + @property + def codes(self): + return self._data.codes + + @property + def categories(self): + return self._data.categories + + @property + def ordered(self): + return self._data.ordered + + def __contains__(self, key): + hash(key) + return key in self.values + + def __array__(self, dtype=None): + """ the array interface, return my values """ + return np.array(self._data, dtype=dtype) + + def argsort(self, *args, **kwargs): + return self.values.argsort(*args, **kwargs) + + @cache_readonly + def _engine(self): + + # we are going to look things up with 
the codes themselves + return self._engine_type(lambda: self.codes.astype('i8'), len(self)) + + @cache_readonly + def is_unique(self): + return not self.duplicated().any() + + @Appender(_shared_docs['duplicated'] % _index_doc_kwargs) + def duplicated(self, take_last=False): + from pandas.hashtable import duplicated_int64 + return duplicated_int64(self.codes.astype('i8'), take_last) + + def get_loc(self, key, method=None): + """ + Get integer location for requested label + + Parameters + ---------- + key : label + method : {None} + * default: exact matches only. + + Returns + ------- + loc : int if unique index, possibly slice or mask if not + """ + codes = self.categories.get_loc(key) + if (codes == -1): + raise KeyError(key) + indexer, _ = self._engine.get_indexer_non_unique(np.array([codes])) + if (indexer==-1).any(): + raise KeyError(key) + + return indexer + + def _can_reindex(self, indexer): + """ always allow reindexing """ + pass + + def reindex(self, target, method=None, level=None, limit=None): + """ + Create index with target's values (move/add/delete values as necessary) + + Returns + ------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index + + """ + + if method is not None: + raise NotImplementedError("argument method is not implemented for CategoricalIndex.reindex") + if level is not None: + raise NotImplementedError("argument level is not implemented for CategoricalIndex.reindex") + if limit is not None: + raise NotImplementedError("argument limit is not implemented for CategoricalIndex.reindex") + + target = _ensure_index(target) + + if not is_categorical_dtype(target) and not target.is_unique: + raise ValueError("cannot reindex with a non-unique indexer") + + indexer, missing = self.get_indexer_non_unique(np.array(target)) + new_target = self.take(indexer) + + + # filling in missing if needed + if len(missing): + cats = self.categories.get_indexer(target) + if (cats==-1).any(): + + 
# coerce to a regular index here! + result = Index(np.array(self),name=self.name) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) + + else: + + codes = new_target.codes.copy() + codes[indexer==-1] = cats[missing] + new_target = self._create_from_codes(codes) + + # we always want to return an Index type here + # to be consistent with .reindex for other index types (e.g. they don't coerce + # based on the actual values, only on the dtype) + # unless we had an inital Categorical to begin with + # in which case we are going to conform to the passed Categorical + new_target = np.asarray(new_target) + if is_categorical_dtype(target): + new_target = target._shallow_copy(new_target, name=self.name) + else: + new_target = Index(new_target, name=self.name) + + return new_target, indexer + + def _reindex_non_unique(self, target): + """ reindex from a non-unique; which CategoricalIndex's are almost always """ + new_target, indexer = self.reindex(target) + new_indexer = None + + check = indexer==-1 + if check.any(): + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[check] = -1 + + return new_target, indexer, new_indexer + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. 
The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : MultiIndex or Index (of tuples) + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[-mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = com._clean_reindex_fill_method(method) + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + if method == 'pad' or method == 'backfill': + raise NotImplementedError("method='pad' and method='backfill' not implemented yet " + 'for CategoricalIndex') + elif method == 'nearest': + raise NotImplementedError("method='nearest' not implemented yet " + 'for CategoricalIndex') + else: + + codes = self.categories.get_indexer(target) + indexer, _ = self._engine.get_indexer_non_unique(codes) + + return com._ensure_platform_int(indexer) + + def get_indexer_non_unique(self, target): + """ this is the same for a CategoricalIndex for get_indexer; the API returns the missing values as well """ + target = _ensure_index(target) + + if isinstance(target, CategoricalIndex): + target = target.categories + + codes = self.categories.get_indexer(target) + return self._engine.get_indexer_non_unique(codes) + + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list indexer. 
+ Return our indexer or raise if all of the values are not included in the categories + """ + codes = self.categories.get_indexer(keyarr) + if (codes==-1).any(): + raise KeyError("a list-indexer must only include values that are in the categories") + + return None + + def take(self, indexer, axis=0): + """ + return a new CategoricalIndex of the values selected by the indexer + + See also + -------- + numpy.ndarray.take + """ + + indexer = com._ensure_platform_int(indexer) + taken = self.codes.take(indexer) + return self._create_from_codes(taken) + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + return self._create_from_codes(np.delete(self.codes, loc)) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not in the categories + + """ + code = self.categories.get_indexer([item]) + if (code == -1): + raise TypeError("cannot insert an item into a CategoricalIndex that is not already an existing category") + + codes = self.codes + codes = np.concatenate( + (codes[:loc], code, codes[loc:])) + return self._create_from_codes(codes) + + def append(self, other): + """ + Append a collection of CategoricalIndex options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + + Raises + ------ + ValueError if other is not in the categories + """ + to_concat, name = self._ensure_compat_append(other) + to_concat = [ self._is_dtype_compat(c) for c in to_concat ] + codes = np.concatenate([ c.codes for c in to_concat ]) + return self._create_from_codes(codes, name=name) + + @classmethod + def _add_comparison_methods(cls): + """ add in comparison methods """ + + def _make_compare(op): + + def 
_evaluate_compare(self, other): + + # if we have a Categorical type, then must have the same categories + if isinstance(other, CategoricalIndex): + other = other.values + elif isinstance(other, Index): + other = self._create_categorical(self, other.values, categories=self.categories, ordered=self.ordered) + + if isinstance(other, ABCCategorical): + if not (self.values.is_dtype_equal(other) and len(self.values) == len(other)): + raise TypeError("categorical index comparisions must have the same categories and ordered attributes") + + return getattr(self.values, op)(other) + + return _evaluate_compare + + cls.__eq__ = _make_compare('__eq__') + cls.__ne__ = _make_compare('__ne__') + cls.__lt__ = _make_compare('__lt__') + cls.__gt__ = _make_compare('__gt__') + cls.__le__ = _make_compare('__le__') + cls.__ge__ = _make_compare('__ge__') + + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the .values """ + method = getattr(self.values, name) + if 'inplace' in kwargs: + raise ValueError("cannot use inplace with CategoricalIndex") + res = method(*args, **kwargs) + if lib.isscalar(res): + return res + return CategoricalIndex(res, name=self.name) + + @classmethod + def _add_accessors(cls): + """ add in Categorical accessor methods """ + + from pandas.core.categorical import Categorical + CategoricalIndex._add_delegate_accessors(delegate=Categorical, + accessors=["rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + "min", + "max"], + typ='method', + overwrite=True) + + +CategoricalIndex._add_numeric_methods_disabled() +CategoricalIndex._add_logical_methods_disabled() +CategoricalIndex._add_comparison_methods() +CategoricalIndex._add_accessors() class NumericIndex(Index): @@ -2791,7 +3424,7 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if self.dtype != other.dtype or 
self.shape != other.shape: + if not is_dtype_equal(self.dtype,other.dtype) or self.shape != other.shape: return False left, right = self.values, other.values return ((left == right) | (self._isnan & other._isnan)).all() @@ -2857,7 +3490,7 @@ def isin(self, values, level=None): value_set = set(values) if level is not None: self._validate_index_level(level) - return lib.ismember_nans(self._array_values(), value_set, + return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) @@ -3197,7 +3830,7 @@ def copy(self, names=None, dtype=None, levels=None, labels=None, verify_integrity=False, _set_identity=_set_identity) - def __array__(self, result=None): + def __array__(self, dtype=None): """ the array interface, return my values """ return self.values @@ -3209,10 +3842,6 @@ def view(self, cls=None): _shallow_copy = view - def _array_values(self): - # hack for various methods - return self.values - @cache_readonly def dtype(self): return np.dtype('O') @@ -3359,7 +3988,7 @@ def values(self): taken = com.take_1d(lev._box_values(lev.values), lab, fill_value=_get_na_value(lev.dtype.type)) else: - taken = com.take_1d(lev.values, lab) + taken = com.take_1d(np.asarray(lev.values), lab) values.append(taken) self._tuples = lib.fast_zip(values) @@ -3424,7 +4053,7 @@ def _try_mi(k): raise except TypeError: # generator/iterator-like - if com.is_iterator(key): + if is_iterator(key): raise InvalidIndexError(key) else: raise e1 @@ -4095,7 +4724,7 @@ def get_indexer(self, target, method=None, limit=None): if isinstance(target, MultiIndex): target_index = target._tuple_index - if target_index.dtype != object: + if not is_object_dtype(target_index.dtype): return np.ones(len(target_index)) * -1 if not self.is_unique: @@ -4654,9 +5283,9 @@ def equals(self, other): return False for i in range(self.nlevels): - svalues = com.take_nd(self.levels[i].values, self.labels[i], + svalues = com.take_nd(np.asarray(self.levels[i].values), self.labels[i], allow_fill=False) - ovalues 
= com.take_nd(other.levels[i].values, other.labels[i], + ovalues = com.take_nd(np.asarray(other.levels[i].values), other.labels[i], allow_fill=False) if not array_equivalent(svalues, ovalues): return False @@ -4772,7 +5401,7 @@ def _assert_can_do_setop(self, other): pass def astype(self, dtype): - if np.dtype(dtype) != np.object_: + if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) return self._shallow_copy() @@ -4852,7 +5481,7 @@ def _wrap_joined_index(self, joined, other): @Appender(Index.isin.__doc__) def isin(self, values, level=None): if level is None: - return lib.ismember(self._array_values(), set(values)) + return lib.ismember(np.array(self), set(values)) else: num = self._get_level_number(level) levs = self.levels[num] diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8154eb1bb6c8b..41950bf8b0e88 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -253,7 +253,7 @@ def _setitem_with_indexer(self, indexer, value): # just replacing the block manager here # so the object is the same index = self.obj._get_axis(i) - labels = safe_append_to_index(index, key) + labels = index.insert(len(index),key) self.obj._data = self.obj.reindex_axis(labels, i)._data self.obj._maybe_update_cacher(clear=True) self.obj.is_copy=None @@ -274,10 +274,7 @@ def _setitem_with_indexer(self, indexer, value): # and set inplace if self.ndim == 1: index = self.obj.index - if len(index) == 0: - new_index = Index([indexer]) - else: - new_index = safe_append_to_index(index, indexer) + new_index = index.insert(len(index),indexer) # this preserves dtype of the value new_values = Series([value]).values @@ -928,24 +925,6 @@ def _getitem_iterable(self, key, axis=0): labels = self.obj._get_axis(axis) - def _reindex(keys, level=None): - - try: - result = self.obj.reindex_axis(keys, axis=axis, level=level) - except AttributeError: - # Series - if axis != 0: - raise 
AssertionError('axis must be 0') - return self.obj.reindex(keys, level=level) - - # this is an error as we are trying to find - # keys in a multi-index that don't exist - if isinstance(labels, MultiIndex) and level is not None: - if hasattr(result,'ndim') and not np.prod(result.shape) and len(keys): - raise KeyError("cannot index a multi-index axis with these keys") - - return result - if is_bool_indexer(key): key = check_bool_indexer(labels, key) inds, = key.nonzero() @@ -958,8 +937,9 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - # handle a mixed integer scenario - indexer = labels._convert_list_indexer_for_mixed(keyarr, kind=self.name) + # have the index handle the indexer and possibly return + # an indexer or raising + indexer = labels._convert_list_indexer(keyarr, kind=self.name) if indexer is not None: return self.obj.take(indexer, axis=axis) @@ -970,65 +950,48 @@ def _reindex(keys, level=None): else: level = None - keyarr_is_unique = Index(keyarr).is_unique + # existing labels are unique and indexer are unique + if labels.is_unique and Index(keyarr).is_unique: + + try: + result = self.obj.reindex_axis(keyarr, axis=axis, level=level) + + # this is an error as we are trying to find + # keys in a multi-index that don't exist + if isinstance(labels, MultiIndex) and level is not None: + if hasattr(result,'ndim') and not np.prod(result.shape) and len(keyarr): + raise KeyError("cannot index a multi-index axis with these keys") + + return result - # existing labels are unique and indexer is unique - if labels.is_unique and keyarr_is_unique: - return _reindex(keyarr, level=level) + except AttributeError: + # Series + if axis != 0: + raise AssertionError('axis must be 0') + return self.obj.reindex(keyarr, level=level) + + # existing labels are non-unique else: - indexer, missing = labels.get_indexer_non_unique(keyarr) - check = indexer != -1 - result = self.obj.take(indexer[check], axis=axis, - 
convert=False) - - # need to merge the result labels and the missing labels - if len(missing): - l = np.arange(len(indexer)) - - missing = com._ensure_platform_int(missing) - missing_labels = keyarr.take(missing) - missing_indexer = com._ensure_int64(l[~check]) - cur_labels = result._get_axis(axis).values - cur_indexer = com._ensure_int64(l[check]) - - new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels - - # reindex with the specified axis - ndim = self.obj.ndim - if axis + 1 > ndim: - raise AssertionError("invalid indexing error with " - "non-unique index") - - # a unique indexer - if keyarr_is_unique: - - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange( - len(result._get_axis(axis)) - ) - new_indexer[missing_indexer] = -1 - # we have a non_unique selector, need to use the original - # indexer here - else: + # reindex with the specified axis + if axis + 1 > self.obj.ndim: + raise AssertionError("invalid indexing error with " + "non-unique index") - # need to retake to have the same size as the indexer - rindexer = indexer.values - rindexer[~check] = 0 - result = self.obj.take(rindexer, axis=axis, - convert=False) + new_target, indexer, new_indexer = labels._reindex_non_unique(keyarr) - # reset the new indexer to account for the new size - new_indexer = np.arange(len(result)) - new_indexer[~check] = -1 + if new_indexer is not None: + result = self.obj.take(indexer[indexer!=-1], axis=axis, + convert=False) result = result._reindex_with_indexers({ - axis: [new_labels, new_indexer] - }, copy=True, allow_dups=True) + axis: [new_target, new_indexer] + }, copy=True, allow_dups=True) + + else: + result = self.obj.take(indexer, axis=axis, + convert=False) return result @@ -1105,8 +1068,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): else: objarr = _asarray_tuplesafe(obj) - # If have 
integer labels, defer to label-based indexing - indexer = labels._convert_list_indexer_for_mixed(objarr, kind=self.name) + # The index may want to handle a list indexer differently + # by returning an indexer or raising + indexer = labels._convert_list_indexer(objarr, kind=self.name) if indexer is not None: return indexer @@ -1719,19 +1683,6 @@ def get_indexer(_i, _idx): return tuple([get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)]) -def safe_append_to_index(index, key): - """ a safe append to an index, if incorrect type, then catch and recreate - """ - try: - return index.insert(len(index), key) - except: - - # raise here as this is basically an unsafe operation and we want - # it to be obvious that you are doing something wrong - raise ValueError("unsafe appending to index of type {0} with a key " - "{1}".format(index.__class__.__name__, key)) - - def maybe_convert_indices(indices, n): """ if we have negative indicies, translate to postive here if have indicies that are out-of-bounds, raise an IndexError diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 864dc0dd46de2..440892f8e8b59 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3134,7 +3134,6 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, pandas-indexer with -1's only. 
""" - if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -3146,10 +3145,9 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, self._consolidate_inplace() - # trying to reindex on an axis with duplicates - if (not allow_dups and not self.axes[axis].is_unique - and len(indexer)): - raise ValueError("cannot reindex from a duplicate axis") + # some axes don't allow reindexing with dups + if not allow_dups: + self.axes[axis]._can_reindex(indexer) if axis >= self.ndim: raise IndexError("Requested axis not found in manager") diff --git a/pandas/core/series.py b/pandas/core/series.py index 7bcf6c6671152..685d44acafe53 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2594,8 +2594,9 @@ def _try_cast(arr, take_fast_path): # GH #846 if isinstance(data, (np.ndarray, Index, Series)): - subarr = np.array(data, copy=False) + if dtype is not None: + subarr = np.array(data, copy=False) # possibility of nan -> garbage if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index af48774492b11..97fa442595893 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp +from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp, CategoricalIndex from pandas.core.config import option_context import pandas.core.common as com @@ -93,6 +93,24 @@ def test_constructor_unsortable(self): else: Categorical.from_array(arr, ordered=True) + def test_is_equal_dtype(self): + + # test dtype comparisons between cats + + c1 = Categorical(list('aabca'),categories=list('abc'),ordered=False) + c2 = Categorical(list('aabca'),categories=list('cab'),ordered=False) + c3 = Categorical(list('aabca'),categories=list('cab'),ordered=True) + 
self.assertTrue(c1.is_dtype_equal(c1)) + self.assertTrue(c2.is_dtype_equal(c2)) + self.assertTrue(c3.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(c2)) + self.assertFalse(c1.is_dtype_equal(c3)) + self.assertFalse(c1.is_dtype_equal(Index(list('aabca')))) + self.assertFalse(c1.is_dtype_equal(c1.astype(object))) + self.assertTrue(c1.is_dtype_equal(CategoricalIndex(c1))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,categories=list('cab')))) + self.assertFalse(c1.is_dtype_equal(CategoricalIndex(c1,ordered=True))) + def test_constructor(self): exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) @@ -224,6 +242,18 @@ def f(): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) cat = Categorical([1,2], categories=[1,2,3]) + # this is a legitimate constructor + with tm.assert_produces_warning(None): + c = Categorical(np.array([],dtype='int64'),categories=[3,2,1],ordered=True) + + def test_constructor_with_index(self): + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci))) + + ci = CategoricalIndex(list('aabbca'),categories=list('cab')) + self.assertTrue(ci.values.equals(Categorical(ci.astype(object),categories=ci.categories))) + def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull returned a scalar # for a generator @@ -2562,6 +2592,8 @@ def f(): dfx['grade'].cat.categories self.assert_numpy_array_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + def test_concat_preserve(self): + # GH 8641 # series concat not preserving category dtype s = Series(list('abc'),dtype='category') @@ -2579,6 +2611,28 @@ def f(): expected = Series(list('abcabc'),index=[0,1,2,0,1,2]).astype('category') tm.assert_series_equal(result, expected) + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }) + result = pd.concat([df2,df2]) + expected = 
DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }) + tm.assert_frame_equal(result, expected) + + def test_categorical_index_preserver(self): + + a = Series(np.arange(6,dtype='int64')) + b = Series(list('aabbca')) + + df2 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('cab')) }).set_index('B') + result = pd.concat([df2,df2]) + expected = DataFrame({'A' : pd.concat([a,a]), 'B' : pd.concat([b,b]).astype('category',categories=list('cab')) }).set_index('B') + tm.assert_frame_equal(result, expected) + + # wrong catgories + df3 = DataFrame({'A' : a, 'B' : b.astype('category',categories=list('abc')) }).set_index('B') + self.assertRaises(TypeError, lambda : pd.concat([df2,df3])) + def test_append(self): cat = pd.Categorical(["a","b"], categories=["a","b"]) vals = [1,2] @@ -2714,6 +2768,14 @@ def cmp(a,b): self.assertRaises(TypeError, lambda : invalid(s)) + def test_astype_categorical(self): + + cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + tm.assert_categorical_equal(cat,cat.astype('category')) + tm.assert_almost_equal(np.array(cat),cat.astype('object')) + + self.assertRaises(ValueError, lambda : cat.astype(float)) + def test_to_records(self): # GH8626 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a35e03d53cb31..5912ccb1494fe 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -33,7 +33,7 @@ import pandas.core.datetools as datetools from pandas import (DataFrame, Index, Series, Panel, notnull, isnull, MultiIndex, DatetimeIndex, Timestamp, date_range, - read_csv, timedelta_range, Timedelta, + read_csv, timedelta_range, Timedelta, CategoricalIndex, option_context) import pandas as pd from pandas.parser import CParserError @@ -2386,6 +2386,32 @@ def test_set_index_pass_arrays(self): expected = df.set_index(['A', 'B'], drop=False) assert_frame_equal(result, expected, check_names=False) # TODO should set_index check_names ? 
+ def test_construction_with_categorical_index(self): + + ci = tm.makeCategoricalIndex(10) + + # with Categorical + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci.values }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + # from a CategoricalIndex + df = DataFrame({'A' : np.random.randn(10), + 'B' : ci }) + idf = df.set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + idf = df.set_index('B').reset_index().set_index('B') + str(idf) + tm.assert_index_equal(idf.index,ci) + + new_df = idf.reset_index() + new_df.index = df.B + tm.assert_index_equal(new_df.index,ci) + def test_set_index_cast_datetimeindex(self): df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], @@ -10744,6 +10770,19 @@ def test_sort_index(self): with assertRaisesRegexp(ValueError, msg): frame.sort_index(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_index_categorical_index(self): + + df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + + result = df.sort_index() + expected = df.iloc[[4,0,1,5,2,3]] + assert_frame_equal(result, expected) + + result = df.sort_index(ascending=False) + expected = df.iloc[[3,2,5,1,0,4]] + assert_frame_equal(result, expected) + def test_sort_nan(self): # GH3917 nan = np.nan diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 87536b9bf0ff8..c5a338520df21 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,7 +8,7 @@ from numpy import nan from pandas import date_range,bdate_range, Timestamp -from pandas.core.index import Index, MultiIndex, Int64Index +from pandas.core.index import Index, MultiIndex, Int64Index, CategoricalIndex from pandas.core.api import Categorical, DataFrame from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) @@ -3378,12 +3378,11 @@ def test_groupby_datetime_categorical(self): 
cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() expected = expected.reindex(levels) - expected.index.name = 'myfactor' + expected.index = CategoricalIndex(expected.index,categories=expected.index,name='myfactor',ordered=True) assert_frame_equal(result, expected) self.assertEqual(result.index.name, cats.name) @@ -3398,6 +3397,26 @@ def test_groupby_datetime_categorical(self): expected.index.names = ['myfactor', None] assert_frame_equal(desc_result, expected) + def test_groupby_categorical_index(self): + + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, name='myfactor', ordered=True) + df = DataFrame(np.repeat(np.arange(20),4).reshape(-1,4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex(Categorical.from_codes([0,1,2,3], levels, ordered=True),name='cats') + assert_frame_equal(result, expected) + def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex @@ -3526,6 +3545,8 @@ def test_groupby_categorical_no_compress(self): result = data.groupby(cats).mean() exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) @@ -3533,6 +3554,7 @@ def test_groupby_categorical_no_compress(self): result = 
data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) + exp.index = CategoricalIndex(exp.index,categories=cats.categories,ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 336340dd95991..1d59d1f3fbfe3 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -12,14 +12,10 @@ import numpy as np from numpy.testing import assert_array_equal -from pandas import period_range, date_range - -from pandas.core.index import (Index, Float64Index, Int64Index, MultiIndex, - InvalidIndexError, NumericIndex) -from pandas.tseries.index import DatetimeIndex -from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex -from pandas.core.series import Series +from pandas import (period_range, date_range, Categorical, Series, + Index, Float64Index, Int64Index, MultiIndex, + CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) +from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, assert_copy) from pandas import compat @@ -41,6 +37,11 @@ class Base(object): _holder = None _compat_props = ['shape', 'ndim', 'size', 'itemsize', 'nbytes'] + def setup_indices(self): + # setup the test indices in the self.indicies dict + for name, ind in self.indices.items(): + setattr(self, name, ind) + def verify_pickle(self,index): unpickled = self.round_trip_pickle(index) self.assertTrue(index.equals(unpickled)) @@ -98,6 +99,7 @@ def f(): def test_reindex_base(self): idx = self.create_index() expected = np.arange(idx.size) + actual = idx.get_indexer(idx) assert_array_equal(expected, actual) @@ -118,29 +120,6 @@ def test_ndarray_compat_properties(self): idx.nbytes idx.values.nbytes - -class TestIndex(Base, tm.TestCase): - _holder = Index - _multiprocess_can_split_ = True - - def setUp(self): - 
self.indices = dict( - unicodeIndex = tm.makeUnicodeIndex(100), - strIndex = tm.makeStringIndex(100), - dateIndex = tm.makeDateIndex(100), - intIndex = tm.makeIntIndex(100), - floatIndex = tm.makeFloatIndex(100), - boolIndex = Index([True,False]), - empty = Index([]), - tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], - [1, 2, 3])) - ) - for name, ind in self.indices.items(): - setattr(self, name, ind) - - def create_index(self): - return Index(list('abcde')) - def test_wrong_number_names(self): def testit(ind): ind.names = ["apple", "banana", "carrot"] @@ -150,14 +129,18 @@ def testit(ind): def test_set_name_methods(self): new_name = "This is the new name for this index" - indices = (self.dateIndex, self.intIndex, self.unicodeIndex, - self.empty) - for ind in indices: + for ind in self.indices.values(): + + # don't tests a MultiIndex here (as its tested separated) + if isinstance(ind, MultiIndex): + continue + original_name = ind.name new_ind = ind.set_names([new_name]) self.assertEqual(new_ind.name, new_name) self.assertEqual(ind.name, original_name) res = ind.rename(new_name, inplace=True) + # should return None self.assertIsNone(res) self.assertEqual(ind.name, new_name) @@ -167,46 +150,128 @@ def test_set_name_methods(self): # ind.set_names("a") with assertRaisesRegexp(ValueError, "Level must be None"): ind.set_names("a", level=0) - # rename in place just leaves tuples and other containers alone - name = ('A', 'B') - ind = self.intIndex - ind.rename(name, inplace=True) - self.assertEqual(ind.name, name) - self.assertEqual(ind.names, [name]) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.strIndex).__name__): - hash(self.strIndex) + # rename in place just leaves tuples and other containers alone + name = ('A', 'B') + ind.rename(name, inplace=True) + self.assertEqual(ind.name, name) + self.assertEqual(ind.names, [name]) - def test_new_axis(self): - new_index = self.dateIndex[None, :] - 
self.assertEqual(new_index.ndim, 2) - tm.assert_isinstance(new_index, np.ndarray) + def test_hash_error(self): + for ind in self.indices.values(): + with tm.assertRaisesRegexp(TypeError, + "unhashable type: %r" % + type(ind).__name__): + hash(ind) def test_copy_and_deepcopy(self): from copy import copy, deepcopy - for func in (copy, deepcopy): - idx_copy = func(self.strIndex) - self.assertIsNot(idx_copy, self.strIndex) - self.assertTrue(idx_copy.equals(self.strIndex)) + for ind in self.indices.values(): - new_copy = self.strIndex.copy(deep=True, name="banana") - self.assertEqual(new_copy.name, "banana") - new_copy2 = self.intIndex.copy(dtype=int) - self.assertEqual(new_copy2.dtype.kind, 'i') + # don't tests a MultiIndex here (as its tested separated) + if isinstance(ind, MultiIndex): + continue + + for func in (copy, deepcopy): + idx_copy = func(ind) + self.assertIsNot(idx_copy, ind) + self.assertTrue(idx_copy.equals(ind)) + + new_copy = ind.copy(deep=True, name="banana") + self.assertEqual(new_copy.name, "banana") def test_duplicates(self): - idx = Index([0, 0, 0]) - self.assertFalse(idx.is_unique) + for ind in self.indices.values(): + + if not len(ind): + continue + idx = self._holder([ind[0]]*5) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) def test_sort(self): - self.assertRaises(TypeError, self.strIndex.sort) + for ind in self.indices.values(): + self.assertRaises(TypeError, ind.sort) def test_mutability(self): - self.assertRaises(TypeError, self.strIndex.__setitem__, 0, 'foo') + for ind in self.indices.values(): + if not len(ind): + continue + self.assertRaises(TypeError, ind.__setitem__, 0, ind[0]) + + def test_view(self): + for ind in self.indices.values(): + i_view = ind.view() + self.assertEqual(i_view.name, ind.name) + + def test_compat(self): + for ind in self.indices.values(): + self.assertEqual(ind.tolist(),list(ind)) + + def test_argsort(self): + for k, ind in self.indices.items(): + + # sep teststed + if k in ['catIndex']: 
+ continue + + result = ind.argsort() + expected = np.array(ind).argsort() + self.assert_numpy_array_equal(result, expected) + + def test_pickle(self): + for ind in self.indices.values(): + self.verify_pickle(ind) + ind.name = 'foo' + self.verify_pickle(ind) + + def test_take(self): + indexer = [4, 3, 0, 2] + for k, ind in self.indices.items(): + + # separate + if k in ['boolIndex','tuples','empty']: + continue + + result = ind.take(indexer) + expected = ind[indexer] + self.assertTrue(result.equals(expected)) + +class TestIndex(Base, tm.TestCase): + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict( + unicodeIndex = tm.makeUnicodeIndex(100), + strIndex = tm.makeStringIndex(100), + dateIndex = tm.makeDateIndex(100), + periodIndex = tm.makePeriodIndex(100), + tdIndex = tm.makeTimedeltaIndex(100), + intIndex = tm.makeIntIndex(100), + floatIndex = tm.makeFloatIndex(100), + boolIndex = Index([True,False]), + catIndex = tm.makeCategoricalIndex(100), + empty = Index([]), + tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) + ) + self.setup_indices() + + def create_index(self): + return Index(list('abcde')) + + def test_new_axis(self): + new_index = self.dateIndex[None, :] + self.assertEqual(new_index.ndim, 2) + tm.assert_isinstance(new_index, np.ndarray) + + def test_copy_and_deepcopy(self): + super(TestIndex, self).test_copy_and_deepcopy() + + new_copy2 = self.intIndex.copy(dtype=int) + self.assertEqual(new_copy2.dtype.kind, 'i') def test_constructor(self): # regular instance creation @@ -297,18 +362,22 @@ def test_constructor_simple_new(self): result = idx._simple_new(idx, 'obj') self.assertTrue(result.equals(idx)) - def test_copy(self): - i = Index([], name='Foo') - i_copy = i.copy() - self.assertEqual(i_copy.name, 'Foo') + def test_view_with_args(self): - def test_view(self): - i = Index([], name='Foo') - i_view = i.view() - self.assertEqual(i_view.name, 'Foo') + restricted = 
['unicodeIndex','strIndex','catIndex','boolIndex','empty'] + + for i in restricted: + ind = self.indices[i] - # with arguments - self.assertRaises(TypeError, lambda : i.view('i8')) + # with arguments + self.assertRaises(TypeError, lambda : ind.view('i8')) + + # these are ok + for i in list(set(self.indices.keys())-set(restricted)): + ind = self.indices[i] + + # with arguments + ind.view('i8') def test_legacy_pickle_identity(self): @@ -330,9 +399,6 @@ def test_astype(self): casted = self.intIndex.astype('i8') self.assertEqual(casted.name, 'foobar') - def test_compat(self): - self.strIndex.tolist() - def test_equals(self): # same self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) @@ -459,11 +525,6 @@ def test_nanosecond_index_access(self): self.assertEqual(first_value, x[Timestamp(np.datetime64('2013-01-01 00:00:00.000000050+0000', 'ns'))]) - def test_argsort(self): - result = self.strIndex.argsort() - expected = np.array(self.strIndex).argsort() - self.assert_numpy_array_equal(result, expected) - def test_comparators(self): index = self.dateIndex element = index[len(index) // 2] @@ -760,22 +821,17 @@ def test_symmetric_diff(self): with tm.assertRaises(TypeError): Index(idx1,dtype='object') - 1 - def test_pickle(self): - - self.verify_pickle(self.strIndex) - self.strIndex.name = 'foo' - self.verify_pickle(self.strIndex) - self.verify_pickle(self.dateIndex) - def test_is_numeric(self): self.assertFalse(self.dateIndex.is_numeric()) self.assertFalse(self.strIndex.is_numeric()) self.assertTrue(self.intIndex.is_numeric()) self.assertTrue(self.floatIndex.is_numeric()) + self.assertFalse(self.catIndex.is_numeric()) def test_is_object(self): self.assertTrue(self.strIndex.is_object()) self.assertTrue(self.boolIndex.is_object()) + self.assertFalse(self.catIndex.is_object()) self.assertFalse(self.intIndex.is_object()) self.assertFalse(self.dateIndex.is_object()) self.assertFalse(self.floatIndex.is_object()) @@ -839,12 +895,6 @@ def test_format_none(self): 
idx.format() self.assertIsNone(idx[3]) - def test_take(self): - indexer = [4, 3, 0, 2] - result = self.dateIndex.take(indexer) - expected = self.dateIndex[indexer] - self.assertTrue(result.equals(expected)) - def test_logical_compat(self): idx = self.create_index() self.assertEqual(idx.all(), idx.values.all()) @@ -857,6 +907,7 @@ def _check_method_works(self, method): method(self.strIndex) method(self.intIndex) method(self.tuples) + method(self.catIndex) def test_get_indexer(self): idx1 = Index([1, 2, 3, 4, 5]) @@ -1338,6 +1389,352 @@ def test_equals_op(self): index_b == index_a, ) +class TestCategoricalIndex(Base, tm.TestCase): + _holder = CategoricalIndex + + def setUp(self): + self.indices = dict(catIndex = tm.makeCategoricalIndex(100)) + self.setup_indices() + + def create_index(self, categories=None, ordered=False): + if categories is None: + categories = list('cab') + return CategoricalIndex(list('aabbca'), categories=categories, ordered=ordered) + + def test_construction(self): + + ci = self.create_index(categories=list('abcd')) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + result = Index(ci.values) + tm.assert_index_equal(result,ci,exact=True) + self.assertFalse(result.ordered) + + # empty + result = CategoricalIndex(categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([],dtype='int8')) + self.assertFalse(result.ordered) + + # passing categories + result = CategoricalIndex(list('aabbca'),categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + + c = pd.Categorical(list('aabbca')) + result = CategoricalIndex(c) + self.assertTrue(result.categories.equals(Index(list('abc')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + 
self.assertFalse(result.ordered) + + result = CategoricalIndex(c,categories=categories) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + ci = CategoricalIndex(c,categories=list('abcd')) + result = CategoricalIndex(ci) + self.assertTrue(result.categories.equals(Index(categories))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,2,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab')) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertFalse(result.ordered) + + result = CategoricalIndex(ci, categories=list('ab'), ordered=True) + self.assertTrue(result.categories.equals(Index(list('ab')))) + self.assert_numpy_array_equal(result.codes,np.array([0,0,1,1,-1,0],dtype='int8')) + self.assertTrue(result.ordered) + + # turn me to an Index + result = Index(np.array(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = self.create_index(categories=list('abc')) + + result = Index(np.array(ci), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + result = Index(np.array(ci).tolist(), dtype='category') + tm.assert_index_equal(result,ci,exact=True) + + # these are generally only equal when the categories are reordered + ci = self.create_index() + + result = Index(np.array(ci), dtype='category').reorder_categories(ci.categories) + tm.assert_index_equal(result,ci,exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0,1,2], categories=[0,1,2], ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def 
test_method_delegation(self): + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.set_categories(list('cab')) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.rename_categories(list('efg')) + tm.assert_index_equal(result, CategoricalIndex(list('ffggef'), categories=list('efg'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.add_categories(['d']) + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabd'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + result = ci.remove_categories(['c']) + tm.assert_index_equal(result, CategoricalIndex(list('aabb') + [np.nan] + ['a'], categories=list('ab'))) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_unordered() + tm.assert_index_equal(result, ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + result = ci.as_ordered() + tm.assert_index_equal(result, CategoricalIndex(list('aabbca'), categories=list('cabdef'), ordered=True)) + + # invalid + self.assertRaises(ValueError, lambda : ci.set_categories(list('cab'), inplace=True)) + + def test_contains(self): + + ci = self.create_index(categories=list('cabdef')) + + self.assertTrue('a' in ci) + self.assertTrue('z' not in ci) + self.assertTrue('e' not in ci) + self.assertTrue(np.nan not in ci) + + # assert codes NOT in index + self.assertFalse(0 in ci) + self.assertFalse(1 in ci) + + ci = CategoricalIndex(list('aabbca'), categories=list('cabdef') + [np.nan]) + self.assertFalse(np.nan in ci) + + ci = CategoricalIndex(list('aabbca') + [np.nan], categories=list('cabdef') + [np.nan]) + self.assertTrue(np.nan in ci) + + def test_min_max(self): + + ci = self.create_index(ordered=False) + self.assertRaises(TypeError, lambda : ci.min()) + self.assertRaises(TypeError, lambda : ci.max()) + + ci = 
self.create_index(ordered=True) + + self.assertEqual(ci.min(),'c') + self.assertEqual(ci.max(),'b') + + def test_append(self): + + ci = self.create_index() + categories = ci.categories + + # append cats with the same categories + result = ci[:3].append(ci[3:]) + tm.assert_index_equal(result,ci,exact=True) + + foos = [ci[:1], ci[1:3], ci[3:]] + result = foos[0].append(foos[1:]) + tm.assert_index_equal(result,ci,exact=True) + + # empty + result = ci.append([]) + tm.assert_index_equal(result,ci,exact=True) + + # appending with different categories or reoreded is not ok + self.assertRaises(TypeError, lambda : ci.append(ci.values.set_categories(list('abcd')))) + self.assertRaises(TypeError, lambda : ci.append(ci.values.reorder_categories(list('abc')))) + + # with objects + result = ci.append(['c','a']) + expected = CategoricalIndex(list('aabbcaca'), categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid objects + self.assertRaises(TypeError, lambda : ci.append(['a','d'])) + + def test_insert(self): + + ci = self.create_index() + categories = ci.categories + + #test 0th element + result = ci.insert(0, 'a') + expected = CategoricalIndex(list('aaabbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + #test Nth element that follows Python list behavior + result = ci.insert(-1, 'a') + expected = CategoricalIndex(list('aabbcaa'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + #test empty + result = CategoricalIndex(categories=categories).insert(0, 'a') + expected = CategoricalIndex(['a'],categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + # invalid + self.assertRaises(TypeError, lambda : ci.insert(0,'d')) + + def test_delete(self): + + ci = self.create_index() + categories = ci.categories + + result = ci.delete(0) + expected = CategoricalIndex(list('abbca'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + result = 
ci.delete(-1) + expected = CategoricalIndex(list('aabbc'),categories=categories) + tm.assert_index_equal(result,expected,exact=True) + + with tm.assertRaises((IndexError, ValueError)): + # either depeidnig on numpy version + result = ci.delete(10) + + def test_astype(self): + + ci = self.create_index() + result = ci.astype('category') + tm.assert_index_equal(result,ci,exact=True) + + result = ci.astype(object) + self.assertTrue(result.equals(Index(np.array(ci)))) + + # this IS equal, but not the same class + self.assertTrue(result.equals(ci)) + self.assertIsInstance(result, Index) + self.assertNotIsInstance(result, CategoricalIndex) + + def test_reindex_base(self): + + # determined by cat ordering + idx = self.create_index() + expected = np.array([4,0,1,5,2,3]) + + actual = idx.get_indexer(idx) + assert_array_equal(expected, actual) + + with tm.assertRaisesRegexp(ValueError, 'Invalid fill method'): + idx.get_indexer(idx, method='invalid') + + def test_reindexing(self): + + ci = self.create_index() + oidx = Index(np.array(ci)) + + for n in [1,2,5,len(ci)]: + finder = oidx[np.random.randint(0,len(ci),size=n)] + expected = oidx.get_indexer_non_unique(finder)[0] + + actual = ci.get_indexer(finder) + assert_array_equal(expected, actual) + + def test_duplicates(self): + + idx = CategoricalIndex([0, 0, 0]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.has_duplicates) + + def test_get_indexer(self): + + idx1 = CategoricalIndex(list('aabcde'),categories=list('edabc')) + idx2 = CategoricalIndex(list('abf')) + + for indexer in [idx2, list('abf'), Index(list('abf'))]: + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [0, 1, 2, -1]) + + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='pad')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='backfill')) + self.assertRaises(NotImplementedError, lambda : idx2.get_indexer(idx1, method='nearest')) + + def test_repr(self): + + ci = CategoricalIndex(['a', 
'b'], categories=['a', 'b'], ordered=True) + str(ci) + tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + + # formatting + if compat.PY3: + str(ci) + else: + compat.text_type(ci) + + # long format + ci = CategoricalIndex(np.random.randint(0,5,size=100)) + result = str(ci) + tm.assert_index_equal(eval(repr(ci)),ci,exact=True) + + def test_isin(self): + + ci = CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]) + self.assert_numpy_array_equal(ci.isin(['c']),np.array([False,False,False,True,False,False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b']),np.array([True]*5 + [False])) + self.assert_numpy_array_equal(ci.isin(['c','a','b',np.nan]),np.array([True]*6)) + + # mismatched categorical -> coerced to ndarray so doesn't matter + self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('abcdefghi'))),np.array([True]*6)) + self.assert_numpy_array_equal(ci.isin(ci.set_categories(list('defghi'))),np.array([False]*5 + [True])) + + def test_identical(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) + self.assertTrue(ci1.identical(ci1)) + self.assertTrue(ci1.identical(ci1.copy())) + self.assertFalse(ci1.identical(ci2)) + + def test_equals(self): + + ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], ordered=True) + + self.assertTrue(ci1.equals(ci1)) + self.assertFalse(ci1.equals(ci2)) + self.assertTrue(ci1.equals(ci1.astype(object))) + self.assertTrue(ci1.astype(object).equals(ci1)) + + self.assertTrue((ci1 == ci1).all()) + self.assertFalse((ci1 != ci1).all()) + self.assertFalse((ci1 > ci1).all()) + self.assertFalse((ci1 < ci1).all()) + self.assertTrue((ci1 <= ci1).all()) + self.assertTrue((ci1 >= ci1).all()) + + self.assertFalse((ci1 == 1).all()) + self.assertTrue((ci1 == Index(['a','b'])).all()) + self.assertTrue((ci1 == 
ci1.values).all()) + + # invalid comparisons + self.assertRaises(TypeError, lambda : ci1 == Index(['a','b','c'])) + self.assertRaises(TypeError, lambda : ci1 == ci2) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, ordered=False)) + self.assertRaises(TypeError, lambda : ci1 == Categorical(ci1.values, categories=list('abc'))) + + # tests + # make sure that we are testing for category inclusion properly + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b']).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca'),categories=['c','a','b',np.nan]).equals(list('aabca'))) + + self.assertFalse(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca'))) + self.assertTrue(CategoricalIndex(list('aabca') + [np.nan],categories=['c','a','b',np.nan]).equals(list('aabca') + [np.nan])) class Numeric(Base): @@ -1417,18 +1814,13 @@ class TestFloat64Index(Numeric, tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.mixed = Float64Index([1.5, 2, 3, 4, 5]) - self.float = Float64Index(np.arange(5) * 2.5) + self.indices = dict(mixed = Float64Index([1.5, 2, 3, 4, 5]), + float = Float64Index(np.arange(5) * 2.5)) + self.setup_indices() def create_index(self): return Float64Index(np.arange(5,dtype='float64')) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.float).__name__): - hash(self.float) - def test_repr_roundtrip(self): for ind in (self.mixed, self.float): tm.assert_index_equal(eval(repr(ind)), ind) @@ -1594,7 +1986,8 @@ class TestInt64Index(Numeric, tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.index = Int64Index(np.arange(0, 20, 2)) + self.indices = dict(index = Int64Index(np.arange(0, 20, 2))) + self.setup_indices() def create_index(self): return Int64Index(np.arange(5,dtype='int64')) @@ -1641,18 +2034,14 @@ def test_constructor_corner(self): with tm.assertRaisesRegexp(TypeError, 'casting'): 
Int64Index(arr_with_floats) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - def test_copy(self): i = Int64Index([], name='Foo') i_copy = i.copy() self.assertEqual(i_copy.name, 'Foo') def test_view(self): + super(TestInt64Index, self).test_view() + i = Int64Index([], name='Foo') i_view = i.view() self.assertEqual(i_view.name, 'Foo') @@ -2053,6 +2442,7 @@ def test_slice_keep_name(self): class DatetimeLike(Base): def test_view(self): + super(DatetimeLike, self).test_view() i = self.create_index() @@ -2068,6 +2458,10 @@ class TestDatetimeIndex(DatetimeLike, tm.TestCase): _holder = DatetimeIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeDateIndex(10)) + self.setup_indices() + def create_index(self): return date_range('20130101',periods=5) @@ -2186,6 +2580,10 @@ class TestPeriodIndex(DatetimeLike, tm.TestCase): _holder = PeriodIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makePeriodIndex(10)) + self.setup_indices() + def create_index(self): return period_range('20130101',periods=5,freq='D') @@ -2220,6 +2618,10 @@ class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex _multiprocess_can_split_ = True + def setUp(self): + self.indices = dict(index = tm.makeTimedeltaIndex(10)) + self.setup_indices() + def create_index(self): return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1) @@ -2294,9 +2696,10 @@ def setUp(self): major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) self.index_names = ['first', 'second'] - self.index = MultiIndex(levels=[major_axis, minor_axis], - labels=[major_labels, minor_labels], - names=self.index_names, verify_integrity=False) + self.indices = dict(index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=self.index_names, verify_integrity=False)) + 
self.setup_indices() def create_index(self): return self.index @@ -2332,13 +2735,7 @@ def test_labels_dtypes(self): self.assertTrue((i.labels[0]>=0).all()) self.assertTrue((i.labels[1]>=0).all()) - def test_hash_error(self): - with tm.assertRaisesRegexp(TypeError, - "unhashable type: %r" % - type(self.index).__name__): - hash(self.index) - - def test_set_names_and_rename(self): + def test_set_name_methods(self): # so long as these are synonyms, we don't need to test set_names self.assertEqual(self.index.rename, self.index.set_names) new_names = [name + "SUFFIX" for name in self.index_names] @@ -3838,7 +4235,7 @@ def test_reindex_level(self): assertRaisesRegexp(TypeError, "Fill method not supported", idx.reindex, idx, method='bfill', level='first') - def test_has_duplicates(self): + def test_duplicates(self): self.assertFalse(self.index.has_duplicates) self.assertTrue(self.index.append(self.index).has_duplicates) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 5f109212add06..3872f79df7286 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2366,6 +2366,7 @@ def test_dups_fancy_indexing(self): rows = ['C','B','E'] expected = DataFrame({'test' : [11,9,np.nan], 'test1': [7.,6,np.nan], 'other': ['d','c',np.nan]},index=rows) + result = df.ix[rows] assert_frame_equal(result, expected) @@ -4422,6 +4423,212 @@ def test_indexing_assignment_dict_already_exists(self): tm.assert_frame_equal(df, expected) + +class TestCategoricalIndex(tm.TestCase): + + def setUp(self): + + self.df = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + self.df2 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series(list('aabbca')).astype('category',categories=list('cabe')) }).set_index('B') + self.df3 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=True) 
}).set_index('B') + self.df4 = DataFrame({'A' : np.arange(6,dtype='int64'), + 'B' : Series([1,1,2,1,3,2]).astype('category',categories=[3,2,1],ordered=False) }).set_index('B') + + + def test_loc_scalar(self): + + result = self.df.loc['a'] + expected = DataFrame({'A' : [0,1,5], + 'B' : Series(list('aaa')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(result, expected) + + + df = self.df.copy() + df.loc['a'] = 20 + expected = DataFrame({'A' : [20,20,2,3,4,20], + 'B' : Series(list('aabbca')).astype('category',categories=list('cab')) }).set_index('B') + assert_frame_equal(df, expected) + + # value not in the categories + self.assertRaises(KeyError, lambda : df.loc['d']) + + def f(): + df.loc['d'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','A'] = 10 + self.assertRaises(TypeError, f) + + def f(): + df.loc['d','C'] = 10 + self.assertRaises(TypeError, f) + + def test_loc_listlike(self): + + # list of labels + result = self.df.loc[['c','a']] + expected = self.df.iloc[[4,0,1,5]] + assert_frame_equal(result, expected) + + result = self.df2.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + # element in the categories but not in the values + self.assertRaises(KeyError, lambda : self.df2.loc['e']) + + # assign is ok + df = self.df2.copy() + df.loc['e'] = 20 + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,20], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + df = self.df2.copy() + result = df.loc[['a','b','e']] + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')).astype('category',categories=list('cabe')) }).set_index('B') + assert_frame_equal(result, expected) + + + # not all labels in the categories + self.assertRaises(KeyError, 
lambda : self.df2.loc[['a','d']]) + + def test_reindexing(self): + + # reindexing + # convert to a regular index + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : [0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['d']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['d']) }).set_index('B') + assert_frame_equal(result, expected) + + # since we are actually reindexing with a Categorical + # then return a Categorical + cats = list('cabe') + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=cats)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a'],categories=cats)) + expected = DataFrame({'A' : [0,1,5], + 'B' : Series(list('aaa')).astype('category',categories=cats) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b','e']) + expected = DataFrame({'A' : [0,1,5,2,3,np.nan], + 'B' : Series(list('aaabbe')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['a','b']) + expected = DataFrame({'A' : [0,1,5,2,3], + 'B' : Series(list('aaabb')) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(['e']) + expected = DataFrame({'A' : [np.nan], + 'B' : Series(['e']) }).set_index('B') + assert_frame_equal(result, expected) + + # give back the type of categorical that we received + result = 
self.df2.reindex(pd.Categorical(['a','d'],categories=cats,ordered=True)) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=cats,ordered=True) }).set_index('B') + assert_frame_equal(result, expected) + + result = self.df2.reindex(pd.Categorical(['a','d'],categories=['a','d'])) + expected = DataFrame({'A' : [0,1,5,np.nan], + 'B' : Series(list('aaad')).astype('category',categories=['a','d']) }).set_index('B') + assert_frame_equal(result, expected) + + # passed duplicate indexers are not allowed + self.assertRaises(ValueError, lambda : self.df2.reindex(['a','a'])) + + # args NotImplemented ATM + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],method='ffill')) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],level=1)) + self.assertRaises(NotImplementedError, lambda : self.df2.reindex(['a'],limit=2)) + + def test_loc_slice(self): + + # slicing + # not implemented ATM + # GH9748 + + self.assertRaises(TypeError, lambda : self.df.loc[1:5]) + + #result = df.loc[1:5] + #expected = df.iloc[[1,2,3,4]] + #assert_frame_equal(result, expected) + + def test_boolean_selection(self): + + df3 = self.df3 + df4 = self.df4 + + result = df3[df3.index == 'a'] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 'a'] + expected = df4.iloc[[]] + assert_frame_equal(result,expected) + + result = df3[df3.index == 1] + expected = df3.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + result = df4[df4.index == 1] + expected = df4.iloc[[0,1,3]] + assert_frame_equal(result,expected) + + # since we have an ordered categorical + + # CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=True, + # name=u'B') + result = df3[df3.index < 2] + expected = df3.iloc[[4]] + assert_frame_equal(result,expected) + + result = df3[df3.index > 1] + expected = df3.iloc[[]] + assert_frame_equal(result,expected) + + # unordered + # cannot be compared + + 
# CategoricalIndex([1, 1, 2, 1, 3, 2], + # categories=[3, 2, 1], + # ordered=False, + # name=u'B') + self.assertRaises(TypeError, lambda : df4[df4.index < 2]) + self.assertRaises(TypeError, lambda : df4[df4.index > 1]) + class TestSeriesNoneCoercion(tm.TestCase): EXPECTED_RESULTS = [ # For numeric series, we should coerce to NaN. diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b4baedada46e1..ea7354a9334ff 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -25,11 +25,6 @@ import pandas as pd from pandas.core.common import is_sequence, array_equivalent, is_list_like -import pandas.core.index as index -import pandas.core.series as series -import pandas.core.frame as frame -import pandas.core.panel as panel -import pandas.core.panel4d as panel4d import pandas.compat as compat from pandas.compat import( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, @@ -38,24 +33,12 @@ from pandas.computation import expressions as expr -from pandas import bdate_range -from pandas.tseries.index import DatetimeIndex -from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex +from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, + Index, MultiIndex, Series, DataFrame, Panel, Panel4D) from pandas.util.decorators import deprecate - from pandas import _testing - - from pandas.io.common import urlopen -Index = index.Index -MultiIndex = index.MultiIndex -Series = series.Series -DataFrame = frame.DataFrame -Panel = panel.Panel -Panel4D = panel4d.Panel4D - N = 30 K = 4 _RAISE_NETWORK_ERROR_DEFAULT = False @@ -550,16 +533,14 @@ def assert_equal(a, b, msg=""): assert a == b, "%s: %r != %r" % (msg.format(a,b), a, b) -def assert_index_equal(left, right): +def assert_index_equal(left, right, exact=False): assert_isinstance(left, Index, '[index] ') assert_isinstance(right, Index, '[index] ') - if not left.equals(right): + if not left.equals(right) or (exact and 
type(left) != type(right)): raise AssertionError("[index] left [{0} {1}], right [{2} {3}]".format(left.dtype, left, right, right.dtype)) - - def assert_attr_equal(attr, left, right): """checks attributes are equal. Both objects must have attribute.""" left_attr = getattr(left, attr) @@ -627,6 +608,7 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(res, exp): + if not array_equivalent(res.categories, exp.categories): raise AssertionError( 'categories not equivalent: {0} vs {1}.'.format(res.categories, @@ -827,6 +809,11 @@ def makeStringIndex(k=10): def makeUnicodeIndex(k=10): return Index(randu_array(nchars=10, size=k)) +def makeCategoricalIndex(k=10, n=3): + """ make a length k index of n categories """ + x = rands_array(nchars=4, size=n) + return CategoricalIndex(np.random.choice(x,k)) + def makeBoolIndex(k=10): if k == 1: return Index([True])