diff --git a/doc/source/10min.rst b/doc/source/10min.rst index 985f112979a7e..6424b82779f0f 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -66,7 +66,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s 'B' : pd.Timestamp('20130102'), 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 'D' : np.array([3] * 4,dtype='int32'), - 'E' : 'foo' }) + 'E' : pd.Categorical(["test","train","test","train"]), + 'F' : 'foo' }) df2 Having specific :ref:`dtypes <basics.dtypes>` @@ -635,6 +636,32 @@ the quarter end: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 ts.head() +Categoricals +------------ + +Since version 0.15, pandas can include categorical data in a `DataFrame`. For full docs, see the +:ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`. + +.. ipython:: python + + df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + + # convert the raw grades to a categorical + df["grade"] = pd.Categorical(df["raw_grade"]) + + # Alternative: df["grade"] = df["raw_grade"].astype("category") + df["grade"] + + # Rename the levels + df["grade"].cat.levels = ["very good", "good", "very bad"] + + # Reorder the levels and simultaneously add the missing levels + df["grade"].cat.reorder_levels(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] + df.sort("grade") + df.groupby("grade").size() + + Plotting -------- diff --git a/doc/source/api.rst b/doc/source/api.rst index ec6e2aff870c6..158fe5624087e 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -528,11 +528,17 @@ and has the following usable methods and properties (all available as :toctree: generated/ Categorical - Categorical.from_codes Categorical.levels Categorical.ordered Categorical.reorder_levels Categorical.remove_unused_levels + +The following methods are considered API when using ``Categorical`` directly: + +.. 
autosummary:: + :toctree: generated/ + + Categorical.from_codes Categorical.min Categorical.max Categorical.mode @@ -547,7 +553,7 @@ the Categorical back to a numpy array, so levels and order information is not pr Categorical.__array__ To create compatibility with `pandas.Series` and `numpy` arrays, the following (non-API) methods -are also introduced. +are also introduced and available when ``Categorical`` is used directly. .. autosummary:: :toctree: generated/ @@ -563,7 +569,8 @@ are also introduced. Categorical.order Categorical.argsort Categorical.fillna - + Categorical.notnull + Categorical.isnull Plotting ~~~~~~~~ diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index c08351eb87a79..95229c4bef3a8 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -90,6 +90,7 @@ By using some special functions: df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) +See :ref:`documentation <reshaping.tile.cut>` for :func:`~pandas.cut`. `Categoricals` have a specific ``category`` :ref:`dtype <basics.dtypes>`: @@ -210,11 +211,9 @@ Renaming levels is done by assigning new values to the ``Category.levels`` or Levels must be unique or a `ValueError` is raised: .. ipython:: python + :okexcept: - try: - s.cat.levels = [1,1,1] - except ValueError as e: - print("ValueError: " + str(e)) + s.cat.levels = [1,1,1] Appending levels can be done by assigning a levels list longer than the current levels: @@ -268,12 +267,11 @@ meaning and certain operations are possible. If the categorical is unordered, a raised. .. 
ipython:: python + :okexcept: s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) - try: - s.sort() - except TypeError as e: - print("TypeError: " + str(e)) + s.sort() + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=True)) s.sort() s @@ -331,6 +329,44 @@ Operations The following operations are possible with categorical data: +Comparing `Categoricals` with other objects is possible in two cases: + * comparing a `Categorical` to another `Categorical`, when `levels` and `ordered` are the same, or + * comparing a `Categorical` to a scalar. +All other comparisons will raise a `TypeError`. + +.. ipython:: python + + cat = pd.Series(pd.Categorical([1,2,3], levels=[3,2,1])) + cat + cat_base = pd.Series(pd.Categorical([2,2,2], levels=[3,2,1])) + cat_base + cat_base2 = pd.Series(pd.Categorical([2,2,2])) + cat_base2 + + cat > cat_base + cat > 2 + +This doesn't work because the levels are not the same: + +.. ipython:: python + :okexcept: + + cat > cat_base2 + +.. note:: + + Comparisons with `Series`, `np.array` or a `Categorical` with different levels or ordering + will raise a `TypeError` because custom level ordering could give two valid results: + one taking the ordering into account and one without. If you want to compare a `Categorical` + with such a type, you need to be explicit and convert the `Categorical` to values: + +.. ipython:: python + :okexcept: + + base = np.array([1,2,3]) + cat > base + np.asarray(cat) > base + Getting the minimum and maximum, if the categorical is ordered: .. ipython:: python @@ -454,21 +490,22 @@ Setting values in a categorical column (or `Series`) works as long as the value df.iloc[2:4,:] = [["b",2],["b",2]] df - try: - df.iloc[2:4,:] = [["c",3],["c",3]] - except ValueError as e: - print("ValueError: " + str(e)) + +The value is not included in the levels here. + +.. 
ipython:: python + :okexcept: + + df.iloc[2:4,:] = [["c",3],["c",3]] Setting values by assigning a `Categorical` will also check that the `levels` match: .. ipython:: python + :okexcept: df.loc["j":"k","cats"] = pd.Categorical(["a","a"], levels=["a","b"]) df - try: - df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) - except ValueError as e: - print("ValueError: " + str(e)) + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) Assigning a `Categorical` to parts of a column of other types will use the values: @@ -489,27 +526,30 @@ but the levels of these `Categoricals` need to be the same: .. ipython:: python - cat = pd.Categorical(["a","b"], levels=["a","b"]) - vals = [1,2] - df = pd.DataFrame({"cats":cat, "vals":vals}) - res = pd.concat([df,df]) - res - res.dtypes + cat = pd.Categorical(["a","b"], levels=["a","b"]) + vals = [1,2] + df = pd.DataFrame({"cats":cat, "vals":vals}) + res = pd.concat([df,df]) + res + res.dtypes - df_different = df.copy() - df_different["cats"].cat.levels = ["a","b","c"] + df_different = df.copy() + df_different["cats"].cat.levels = ["a","b","c"] - try: - pd.concat([df,df]) - except ValueError as e: - print("ValueError: " + str(e)) +The levels of ``df`` and ``df_different`` are not the same, so concatenating them raises: + +.. ipython:: python + :okexcept: + + pd.concat([df,df_different]) The same applies to ``df.append(df)``. Getting Data In/Out ------------------- -Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently raise ``NotImplementedError``. +Writing data (`Series`, `Frames`) to a HDF store that contains a ``category`` dtype will currently +raise ``NotImplementedError``. Writing to a CSV file will convert the data, effectively removing any information about the `Categorical` (levels and ordering). 
So if you read back the CSV file you have to convert the @@ -575,33 +615,26 @@ object and not as a low level `numpy` array dtype. This leads to some problems. `numpy` itself doesn't know about the new `dtype`: .. 
ipython:: python + :okexcept: - try: - np.dtype("category") - except TypeError as e: - print("TypeError: " + str(e)) + np.dtype("category") + dtype = pd.Categorical(["a"]).dtype + np.dtype(dtype) - dtype = pd.Categorical(["a"]).dtype - try: - np.dtype(dtype) - except TypeError as e: - print("TypeError: " + str(e)) - - # dtype comparisons work: - dtype == np.str_ - np.str_ == dtype + # dtype comparisons work: + dtype == np.str_ + np.str_ == dtype Using `numpy` functions on a `Series` of type ``category`` should not work as `Categoricals` are not numeric data (even in the case that ``.levels`` is numeric). .. ipython:: python + :okexcept: - s = pd.Series(pd.Categorical([1,2,3,4])) - try: - np.sum(s) - #same with np.log(s),.. - except TypeError as e: - print("TypeError: " + str(e)) + s = pd.Series(pd.Categorical([1,2,3,4])) + + #same with np.log(s),.. + np.sum(s) .. note:: If such a function works, please file a bug at https://github.com/pydata/pandas! @@ -647,14 +680,14 @@ Both `Series` and `Categorical` have a method ``.reorder_levels()`` but for diff Series of type ``category`` this means that there is some danger to confuse both methods. .. ipython:: python + :okexcept: s = pd.Series(pd.Categorical([1,2,3,4])) print(s.cat.levels) + # wrong and raises an error: - try: - s.reorder_levels([4,3,2,1]) - except Exception as e: - print("Exception: " + str(e)) + s.reorder_levels([4,3,2,1]) + # right s.cat.reorder_levels([4,3,2,1]) print(s.cat.levels) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 92a35d0276e22..3d40be37dbbb3 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -503,3 +503,10 @@ handling of NaN: pd.factorize(x, sort=True) np.unique(x, return_inverse=True)[::-1] + +.. note:: + If you just want to handle one column as a categorical variable (like R's factor), + you can use ``df["cat_col"] = pd.Categorical(df["col"])`` or + ``df["cat_col"] = df["col"].astype("category")``. 
For full docs on :class:`~pandas.Categorical`, + see the :ref:`Categorical introduction <categorical>` and the + :ref:`API documentation <api.categorical>`. This feature was introduced in version 0.15. diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 322bcba9664d9..aa28796061599 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -226,7 +226,8 @@ Categoricals in Series/DataFrame methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`). -For full docs, see the :ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`. +For full docs, see the :ref:`Categorical introduction <categorical>` and the +:ref:`API documentation <api.categorical>`. .. ipython:: python diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index c9674aea4a715..43217f2abe240 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2,12 +2,13 @@ import numpy as np from warnings import warn +import types from pandas import compat from pandas.compat import u -from pandas.core.algorithms import factorize, unique -from pandas.core.base import PandasObject +from pandas.core.algorithms import factorize +from pandas.core.base import PandasObject, PandasDelegate from pandas.core.index import Index, _ensure_index from pandas.core.indexing import _is_null_slice from pandas.tseries.period import PeriodIndex @@ -18,16 +19,36 @@ def _cat_compare_op(op): def f(self, other): - if isinstance(other, (Categorical, np.ndarray)): - values = np.asarray(self) - f = getattr(values, op) - return f(np.asarray(other)) - else: + # On python2, you can usually compare any type to any type, and Categoricals can be + # seen as a custom type, but having different results depending on whether the levels are + # the same or not is kind of insane, so be a bit stricter here and use the python3 idea + # of comparing only things of equal type. 
+ if not self.ordered: + if op in ['__lt__', '__gt__','__le__','__ge__']: + raise TypeError("Unordered Categoricals can only compare equality or not") + if isinstance(other, Categorical): + # Two Categoricals can only be be compared if the levels are the same + if (len(self.levels) != len(other.levels)) or not ((self.levels == other.levels).all()): + raise TypeError("Categoricals can only be compared if 'levels' are the same") + if not (self.ordered == other.ordered): + raise TypeError("Categoricals can only be compared if 'ordered' is the same") + na_mask = (self._codes == -1) | (other._codes == -1) + f = getattr(self._codes, op) + ret = f(other._codes) + if na_mask.any(): + # In other series, the leads to False, so do that here too + ret[na_mask] = False + return ret + elif np.isscalar(other): if other in self.levels: i = self.levels.get_loc(other) return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) + else: + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'np.asarray(cat) other'." 
+ raise TypeError(msg.format(op=op,typ=type(other))) f.__name__ = op @@ -109,9 +130,9 @@ class Categorical(PandasObject): Attributes ---------- - levels : ndarray + levels : Index The levels of this categorical - codes : Index + codes : ndarray The codes (integer positions, which point to the levels) of this categorical, read only ordered : boolean Whether or not this Categorical is ordered @@ -171,6 +192,9 @@ class Categorical(PandasObject): Categorical.max """ + # For comparisons, so that numpy uses our implementation if the compare ops, which raise + __array_priority__ = 1000 + def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): if fastpath: @@ -208,8 +232,23 @@ def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, # under certain versions of numpy as well inferred = com._possibly_infer_to_datetimelike(values) if not isinstance(inferred, np.ndarray): + + # isnull doesn't work with generators/xrange, so convert all to lists + if com._is_sequence(values) or isinstance(values, types.GeneratorType): + values = list(values) + elif np.isscalar(values): + values = [values] + from pandas.core.series import _sanitize_array - values = _sanitize_array(values, None) + # On list with NaNs, int values will be converted to float. Use "object" dtype + # to prevent this. In the end objects will be casted to int/... in the level + # assignment step. 
+ # tuple are list_like but com.isnull() will return a single bool, + # which then raises an AttributeError: 'bool' object has no attribute 'any' + has_null = (com.is_list_like(values) and not isinstance(values, tuple) + and com.isnull(values).any()) + dtype = 'object' if has_null else None + values = _sanitize_array(values, None, dtype=dtype) if levels is None: try: @@ -277,7 +316,7 @@ def from_array(cls, data): return Categorical(data) @classmethod - def from_codes(cls, codes, levels, ordered=True, name=None): + def from_codes(cls, codes, levels, ordered=False, name=None): """ Make a Categorical type from codes and levels arrays. @@ -294,7 +333,7 @@ def from_codes(cls, codes, levels, ordered=True, name=None): The levels for the categorical. Items need to be unique. ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will be ordered. + the resulting categorical will be unordered. name : str, optional Name for the Categorical variable. 
""" @@ -429,9 +468,13 @@ def __array__(self, dtype=None): Returns ------- values : numpy array - A numpy array of the same dtype as categorical.levels.dtype + A numpy array of either the specified dtype or, if dtype==None (default), the same + dtype as categorical.levels.dtype """ - return com.take_1d(self.levels.values, self._codes) + ret = com.take_1d(self.levels.values, self._codes) + if dtype and dtype != self.levels.dtype: + return np.asarray(ret, dtype) + return ret @property def T(self): @@ -503,10 +546,27 @@ def order(self, inplace=False, ascending=True, na_position='last', **kwargs): if na_position not in ['last','first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) - codes = np.sort(self._codes.copy()) + codes = np.sort(self._codes) if not ascending: codes = codes[::-1] + # NaN handling + na_mask = (codes==-1) + if na_mask.any(): + n_nans = len(codes[na_mask]) + if na_position=="first" and not ascending: + # in this case sort to the front + new_codes = codes.copy() + new_codes[0:n_nans] = -1 + new_codes[n_nans:] = codes[~na_mask] + codes = new_codes + elif na_position=="last" and not ascending: + # ... 
and to the end + new_codes = codes.copy() + pos = len(codes)-n_nans + new_codes[0:pos] = codes[~na_mask] + new_codes[pos:] = -1 + codes = new_codes if inplace: self._codes = codes return @@ -542,6 +602,32 @@ def sort(self, inplace=True, ascending=True, na_position='last', **kwargs): """ return self.order(inplace=inplace, ascending=ascending, **kwargs) + def isnull(self): + """ + Returns + ------- + a boolean array of whether my values are null + + """ + + ret = self._codes == -1 + + # String/object and float levels can hold np.nan + if self.levels.dtype.kind in ['S', 'O', 'f']: + if np.nan in self.levels: + nan_pos = np.where(com.isnull(self.levels)) + ret = ret | self == nan_pos + return ret + + def notnull(self): + """ + Returns + ------- + a boolean array of whether my values are not null + + """ + return ~self.isnull() + def ravel(self, order='C'): """ Return a flattened (numpy) array. @@ -760,7 +846,8 @@ def __setitem__(self, key, value): rvalue = value if com.is_list_like(value) else [value] to_add = Index(rvalue)-self.levels - if len(to_add): + # no assignments of values not in levels, but it's always ok to set something to np.nan + if len(to_add) and not com.isnull(to_add).all(): raise ValueError("cannot setitem on a Categorical with a new level," " set the levels first") @@ -786,6 +873,13 @@ def __setitem__(self, key, value): key = self._codes[key] lindexer = self.levels.get_indexer(rvalue) + + # float levels do currently return -1 for np.nan, even if np.nan is included in the index + # "repair" this here + if com.isnull(rvalue).any() and com.isnull(self.levels).any(): + nan_pos = np.where(com.isnull(self.levels)) + lindexer[lindexer == -1] = nan_pos + self._codes[key] = lindexer #### reduction ops #### @@ -916,16 +1010,67 @@ def describe(self): 'values' : self._codes } ).groupby('codes').count() - counts.index = self.levels.take(counts.index) - counts = counts.reindex(self.levels) freqs = counts / float(counts.sum()) from pandas.tools.merge import concat 
result = concat([counts,freqs],axis=1) - result.index.name = 'levels' result.columns = ['counts','freqs'] + + # fill in the real levels + check = result.index == -1 + if check.any(): + # Sort -1 (=NaN) to the last position + index = np.arange(0, len(self.levels)+1) + index[-1] = -1 + result = result.reindex(index) + # build new index + levels = np.arange(0,len(self.levels)+1 ,dtype=object) + levels[:-1] = self.levels + levels[-1] = np.nan + result.index = levels.take(result.index) + else: + result.index = self.levels.take(result.index) + result = result.reindex(self.levels) + result.index.name = 'levels' + return result +##### The Series.cat accessor ##### + +class CategoricalProperties(PandasDelegate): + """ + Accessor object for categorical properties of the Series values. + + Examples + -------- + >>> s.cat.levels + >>> s.cat.levels = list('abc') + >>> s.cat.reorder_levels('cab') + + Allows accessing to specific getter and access methods + """ + + def __init__(self, values, index): + self.categorical = values + self.index = index + + def _delegate_property_get(self, name): + return getattr(self.categorical, name) + + def _delegate_property_set(self, name, new_values): + return setattr(self.categorical, name, new_values) + + def _delegate_method(self, name, *args, **kwargs): + method = getattr(self.categorical, name) + return method(*args, **kwargs) + +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["levels", "codes", "ordered"], + typ='property') +CategoricalProperties._add_delegate_accessors(delegate=Categorical, + accessors=["reorder_levels", "remove_unused_levels"], + typ='method') + ##### utility routines ##### def _get_codes_for_values(values, levels): diff --git a/pandas/core/common.py b/pandas/core/common.py index bc4c95ed3323e..8ec47a94d3c73 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -276,15 +276,22 @@ def _isnull_ndarraylike(obj): dtype = values.dtype if dtype.kind in ('O', 'S', 'U'): - # Working 
around NumPy ticket 1542 - shape = values.shape - - if dtype.kind in ('S', 'U'): - result = np.zeros(values.shape, dtype=bool) + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] = vec.reshape(shape) + + # Working around NumPy ticket 1542 + shape = values.shape + + if dtype.kind in ('S', 'U'): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) elif dtype in _DATELIKE_DTYPES: # this is the NaT pattern @@ -299,7 +306,6 @@ def _isnull_ndarraylike(obj): return result - def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) dtype = values.dtype @@ -2448,7 +2454,7 @@ def _get_callable_name(obj): # instead of the empty string in this case to allow # distinguishing between no name and a name of '' return None - + _string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, compat.text_type))) diff --git a/pandas/core/format.py b/pandas/core/format.py index 8f749d07296a7..0539d803a42a4 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -177,7 +177,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done for Categoricals # Only added when we request a name if self.name and com.is_categorical_dtype(self.series.dtype): - level_info = self.series.cat._repr_level_info() + level_info = self.series.values._repr_level_info() if footer: footer += "\n" footer += level_info diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 9f29570af6f4f..de3b8d857617f 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -524,6 +524,10 @@ def _comp_method_SERIES(op, name, str_rep, masker=False): code duplication. 
""" def na_op(x, y): + if com.is_categorical_dtype(x) != com.is_categorical_dtype(y): + msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ + "compare values, use 'series np.asarray(cat)'." + raise TypeError(msg.format(op=op,typ=type(y))) if x.dtype == np.object_: if isinstance(y, list): y = lib.list_to_object_array(y) @@ -555,11 +559,16 @@ def wrapper(self, other): index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented - elif isinstance(other, (pa.Array, pd.Series, pd.Index)): + elif isinstance(other, (pa.Array, pd.Index)): if len(self) != len(other): raise ValueError('Lengths must match to compare') return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): + if not com.is_categorical_dtype(self): + msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\ + "If you want to compare values, use 'series np.asarray(other)'." 
+ raise TypeError(msg.format(op=op,typ=self.dtype)) else: mask = isnull(self) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a490992c478c..ef6bdf99915b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -900,7 +900,7 @@ def _repr_footer(self): # Categorical if com.is_categorical_dtype(self.dtype): - level_info = self.cat._repr_level_info() + level_info = self.values._repr_level_info() return u('%sLength: %d, dtype: %s\n%s') % (namestr, len(self), str(self.dtype.name), @@ -2415,11 +2415,12 @@ def dt(self): #------------------------------------------------------------------------------ # Categorical methods - @property + @cache_readonly def cat(self): + from pandas.core.categorical import CategoricalProperties if not com.is_categorical_dtype(self.dtype): raise TypeError("Can only use .cat accessor with a 'category' dtype") - return self.values + return CategoricalProperties(self.values, self.index) Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 421e05f5a3bc7..dbe7aad723ee7 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -111,6 +111,50 @@ def test_constructor(self): cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) self.assertTrue(com.is_integer_dtype(cat.levels)) + # https://github.com/pydata/pandas/issues/3678 + cat = pd.Categorical([np.nan,1, 2, 3]) + self.assertTrue(com.is_integer_dtype(cat.levels)) + + # this should result in floats + cat = pd.Categorical([np.nan, 1, 2., 3 ]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + cat = pd.Categorical([np.nan, 1., 2., 3. 
]) + self.assertTrue(com.is_float_dtype(cat.levels)) + + # corner cases + cat = pd.Categorical([1]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + cat = pd.Categorical(["a"]) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == "a") + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + # Scalars should be converted to lists + cat = pd.Categorical(1) + self.assertTrue(len(cat.levels) == 1) + self.assertTrue(cat.levels[0] == 1) + self.assertTrue(len(cat.codes) == 1) + self.assertTrue(cat.codes[0] == 0) + + + def test_constructor_with_generator(self): + # This was raising an Error in isnull(single_val).any() because isnull returned a scalar + # for a generator + + a = (a for x in [1,2]) + cat = Categorical(a) + + # This does actually a xrange, which is a sequence instead of a generator + from pandas.core.index import MultiIndex + MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + def test_from_codes(self): # too few levels @@ -134,7 +178,7 @@ def f(): self.assertRaises(ValueError, f) - exp = Categorical(["a","b","c"]) + exp = Categorical(["a","b","c"], ordered=False) res = Categorical.from_codes([0,1,2], ["a","b","c"]) self.assertTrue(exp.equals(res)) @@ -179,6 +223,63 @@ def test_comparisons(self): expected = np.repeat(False, len(self.factor)) self.assert_numpy_array_equal(result, expected) + # comparisons with categoricals + cat_rev = pd.Categorical(["a","b","c"], levels=["c","b","a"]) + cat_rev_base = pd.Categorical(["b","b","b"], levels=["c","b","a"]) + cat = pd.Categorical(["a","b","c"]) + cat_base = pd.Categorical(["b","b","b"], levels=cat.levels) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = np.array([True, False, False]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = np.array([False, 
False, True]) + self.assert_numpy_array_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = np.array([False, False, True]) + self.assert_numpy_array_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + cat_rev_base2 = pd.Categorical(["b","b","b"], levels=["c","b","a","d"]) + def f(): + cat_rev > cat_rev_base2 + self.assertRaises(TypeError, f) + + # Only categories with same ordering information can be compared + cat_unorderd = cat.copy() + cat_unorderd.ordered = False + self.assertFalse((cat > cat).any()) + def f(): + cat > cat_unorderd + self.assertRaises(TypeError, f) + + # comparison (in both directions) with Series will raise + s = Series(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + # comparison with numpy.array will raise in both direction, but only on newer + # numpy versions + a = np.array(["b","b","b"]) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + # The following work via '__array_priority__ = 1000' + # and py3_2 is not friendly + tm._skip_if_not_numpy17_friendly() + if not compat.PY3_2: + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + def test_na_flags_int_levels(self): # #1457 @@ -205,6 +306,16 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # check unused levels + cat = self.factor.copy() + cat.levels = ["a","b","c","d"] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[3, 2, 3, np.nan], + freqs=[3/8., 2/8., 3/8., np.nan], + levels=['a', 'b', 'c', 'd']) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + # check an integer one desc = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe() expected = 
DataFrame.from_dict(dict(counts=[5, 3, 3], @@ -214,6 +325,47 @@ def test_describe(self): ).set_index('levels') tm.assert_frame_equal(desc, expected) + # https://github.com/pydata/pandas/issues/3678 + # describe should work with NaN + cat = pd.Categorical([np.nan,1, 2, 2]) + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, 1], + freqs=[1/4., 2/4., 1/4.], + levels=[1,2,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # having NaN as level and as "not available" should also print two NaNs in describe! + cat = pd.Categorical([np.nan,1, 2, 2]) + cat.levels = [1,2,np.nan] + desc = cat.describe() + expected = DataFrame.from_dict(dict(counts=[1, 2, np.nan, 1], + freqs=[1/4., 2/4., np.nan, 1/4.], + levels=[1,2,np.nan,np.nan] + ) + ).set_index('levels') + tm.assert_frame_equal(desc, expected) + + # empty levels show up as NA + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + result = cat.describe() + + expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], + columns=['counts','freqs'], + index=Index(['a','b','c'],name='levels')) + tm.assert_frame_equal(result,expected) + + # NA as a level + cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) + result = cat.describe() + + expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], + columns=['counts','freqs'], + index=Index(['b','a','c',np.nan],name='levels')) + tm.assert_frame_equal(result,expected) + + def test_print(self): expected = [" a", " b", " b", " a", " a", " c", " c", " c", "Levels (3, object): [a < b < c]"] @@ -496,6 +648,44 @@ def test_slicing_directly(self): self.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.levels, expected.levels) + def test_set_item_nan(self): + cat = pd.Categorical([1,2,3]) + exp = pd.Categorical([1,np.nan,3], levels=[1,2,3]) + cat[1] = np.nan + self.assertTrue(cat.equals(exp)) + + # if nan in levels, the proper code should be set! 
+ cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1] = np.nan + exp = np.array([0,3,2,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = np.nan + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, 1] + exp = np.array([0,3,0,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[1:3] = [np.nan, np.nan] + exp = np.array([0,3,3,-1]) + self.assert_numpy_array_equal(cat.codes, exp) + + cat = pd.Categorical([1,2,3, np.nan], levels=[1,2,3]) + cat.levels = [1,2,3, np.nan] + cat[pd.isnull(cat)] = np.nan + exp = np.array([0,1,2,3]) + self.assert_numpy_array_equal(cat.codes, exp) + + class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True @@ -616,7 +806,7 @@ def test_sideeffects_free(self): # so this WILL change values cat = Categorical(["a","b","c","a"]) s = pd.Series(cat) - self.assertTrue(s.cat is cat) + self.assertTrue(s.values is cat) s.cat.levels = [1,2,3] exp_s = np.array([1,2,3,1]) self.assert_numpy_array_equal(s.__array__(), exp_s) @@ -632,20 +822,20 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a","b",np.nan,"a"])) self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) - self.assert_numpy_array_equal(s.cat._codes, np.array([0,1,-1,0])) + self.assert_numpy_array_equal(s.cat.codes, np.array([0,1,-1,0])) # If levels have nan included, the label should point to that instead s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) self.assert_numpy_array_equal(s2.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s2.cat._codes, np.array([0,1,2,0])) + 
self.assert_numpy_array_equal(s2.cat.codes, np.array([0,1,2,0])) # Changing levels should also make the replaced level np.nan s3 = Series(Categorical(["a","b","c","a"])) s3.cat.levels = ["a","b",np.nan] self.assert_numpy_array_equal(s3.cat.levels, np.array(["a","b",np.nan], dtype=np.object_)) - self.assert_numpy_array_equal(s3.cat._codes, np.array([0,1,2,0])) + self.assert_numpy_array_equal(s3.cat.codes, np.array([0,1,2,0])) def test_sequence_like(self): @@ -655,8 +845,8 @@ def test_sequence_like(self): df['grade'] = Categorical(df['raw_grade']) # basic sequencing testing - result = list(df.grade.cat) - expected = np.array(df.grade.cat).tolist() + result = list(df.grade.values) + expected = np.array(df.grade.values).tolist() tm.assert_almost_equal(result,expected) # iteration @@ -698,7 +888,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","c","a"]) s.cat.reorder_levels(["c","b","a"]) self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused levels @@ -707,7 +897,7 @@ def test_series_delegations(self): exp_values = np.array(["a","b","b","a"]) s.cat.remove_unused_levels() self.assert_numpy_array_equal(s.cat.levels, exp_levels) - self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error on wrong inputs: @@ -716,6 +906,16 @@ def f(): self.assertRaises(Exception, f) # right: s.cat.reorder_levels([4,3,2,1]) + # test the tab completion display + ok_for_cat = ['levels','codes','ordered','reorder_levels','remove_unused_levels'] + def get_dir(s): + results = [ r for r in s.cat.__dir__() if not r.startswith('_') ] + return list(sorted(set(results))) + + s = 
Series(list('aabbcde')).astype('category') + results = get_dir(s) + tm.assert_almost_equal(results,list(sorted(set(ok_for_cat)))) + def test_series_functions_no_warnings(self): df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] @@ -766,31 +966,16 @@ def test_describe(self): result = self.cat.describe() self.assertEquals(len(result.columns),1) - # empty levels show up as NA - s = Series(Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True)) - result = s.cat.describe() - expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], - columns=['counts','freqs'], - index=Index(['a','b','c'],name='levels')) - tm.assert_frame_equal(result,expected) + # In a frame, describe() for the cat should be the same as for string arrays (count, unique, + # top, freq) + cat = Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True) + s = Series(cat) result = s.describe() expected = Series([4,2,"b",3],index=['count','unique','top', 'freq']) tm.assert_series_equal(result,expected) - # NA as a level - cat = pd.Categorical(["a","c","c",np.nan], levels=["b","a","c",np.nan] ) - result = cat.describe() - - expected = DataFrame([[np.nan, np.nan],[1,0.25],[2,0.5], [1,0.25]], - columns=['counts','freqs'], - index=Index(['b','a','c',np.nan],name='levels')) - tm.assert_frame_equal(result,expected) - - - # In a frame, describe() for the cat should be the same as for string arrays (count, unique, - # top, freq) cat = pd.Series(pd.Categorical(["a","b","c","c"])) df3 = pd.DataFrame({"cat":cat, "s":["a","b","c","c"]}) res = df3.describe() @@ -970,7 +1155,7 @@ def test_sort(self): # Cats must be sorted in a dataframe res = df.sort(columns=["string"], ascending=False) exp = np.array(["d", "c", "b", "a"]) - self.assert_numpy_array_equal(res["sort"].cat.__array__(), exp) + self.assert_numpy_array_equal(res["sort"].values.__array__(), exp) self.assertEqual(res["sort"].dtype, "category") res = 
df.sort(columns=["sort"], ascending=False) @@ -1013,17 +1198,29 @@ def f(): res = cat.order(ascending=False, na_position='last') exp_val = np.array(["d","c","b","a", np.nan],dtype=object) exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) cat = Categorical(["a","c","b","d", np.nan], ordered=True) res = cat.order(ascending=False, na_position='first') exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) exp_levels = np.array(["a","b","c","d"],dtype=object) - # FIXME: IndexError: Out of bounds on buffer access (axis 0) - #self.assert_numpy_array_equal(res.__array__(), exp_val) - #self.assert_numpy_array_equal(res.levels, exp_levels) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='last') + exp_val = np.array(["d","c","b","a",np.nan],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) @@ -1473,6 +1670,63 @@ def f(): df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) tm.assert_frame_equal(df, exp) + # ensure that one can set something to np.nan + s = 
Series(Categorical([1,2,3])) + exp = Series(Categorical([1,np.nan,3])) + s[1] = np.nan + tm.assert_series_equal(s, exp) + + def test_comparisons(self): + tests_data = [(list("abc"), list("cba"), list("bbb")), + ([1,2,3], [3,2,1], [2,2,2])] + for data , reverse, base in tests_data: + cat_rev = pd.Series(pd.Categorical(data, levels=reverse)) + cat_rev_base = pd.Series(pd.Categorical(base, levels=reverse)) + cat = pd.Series(pd.Categorical(data)) + cat_base = pd.Series(pd.Categorical(base, levels=cat.cat.levels)) + s = Series(base) + a = np.array(base) + + # comparisons need to take level ordering into account + res_rev = cat_rev > cat_rev_base + exp_rev = Series([True, False, False]) + tm.assert_series_equal(res_rev, exp_rev) + + res_rev = cat_rev < cat_rev_base + exp_rev = Series([False, False, True]) + tm.assert_series_equal(res_rev, exp_rev) + + res = cat > cat_base + exp = Series([False, False, True]) + tm.assert_series_equal(res, exp) + + # Only categories with same levels can be compared + def f(): + cat > cat_rev + self.assertRaises(TypeError, f) + + # categorical cannot be compared to Series or numpy array, and also not the other way + # around + self.assertRaises(TypeError, lambda: cat > s) + self.assertRaises(TypeError, lambda: cat_rev > s) + self.assertRaises(TypeError, lambda: cat > a) + self.assertRaises(TypeError, lambda: cat_rev > a) + + self.assertRaises(TypeError, lambda: s < cat) + self.assertRaises(TypeError, lambda: s < cat_rev) + + self.assertRaises(TypeError, lambda: a < cat) + self.assertRaises(TypeError, lambda: a < cat_rev) + + # Categoricals can be compared to scalar values + res = cat_rev > base[0] + tm.assert_series_equal(res, exp) + + # And test NaN handling... + cat = pd.Series(pd.Categorical(["a","b","c", np.nan])) + exp = Series([True, True, True, False]) + res = (cat == cat) + tm.assert_series_equal(res, exp) def test_concat(self): cat = pd.Categorical(["a","b"], levels=["a","b"])