diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index b0267c3dc5163..9279d8b0288c4 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -116,7 +116,8 @@ Categoricals in Series/DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new -methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`). +methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, +:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`). For full docs, see the :ref:`Categorical introduction ` and the :ref:`API documentation `. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d049a6d64aac3..f9ed6c2fecc3c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -16,7 +16,6 @@ from pandas.core.config import get_option from pandas.core import format as fmt - def _cat_compare_op(op): def f(self, other): if isinstance(other, (Categorical, np.ndarray)): @@ -45,16 +44,6 @@ def _maybe_to_categorical(array): return array -def _get_codes_for_values(values, levels): - from pandas.core.algorithms import _get_data_algo, _hashtables - if values.dtype != levels.dtype: - values = com._ensure_object(values) - levels = com._ensure_object(levels) - (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - t = hash_klass(len(levels)) - t.map_locations(levels) - return com._ensure_platform_int(t.lookup(values)) - _codes_doc = """The level codes of this categorical. 
Level codes are an array if integer which are the positions of the real @@ -484,7 +473,7 @@ def argsort(self, ascending=True, **kwargs): result = result[::-1] return result - def order(self, inplace=False, ascending=True, **kwargs): + def order(self, inplace=False, ascending=True, na_position='last', **kwargs): """ Sorts the Category by level value returning a new Categorical by default. Only ordered Categoricals can be sorted! @@ -495,11 +484,11 @@ def order(self, inplace=False, ascending=True, **kwargs): ---------- ascending : boolean, default True Sort ascending. Passing False sorts descending + inplace : boolean, default False + Do operation in place. na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end - inplace : boolean, default False - Do operation in place. Returns ------- @@ -511,18 +500,22 @@ def order(self, inplace=False, ascending=True, **kwargs): """ if not self.ordered: raise TypeError("Categorical not ordered") - _sorted = np.sort(self._codes.copy()) + if na_position not in ['last','first']: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + codes = np.sort(self._codes.copy()) if not ascending: - _sorted = _sorted[::-1] + codes = codes[::-1] + if inplace: - self._codes = _sorted + self._codes = codes return else: - return Categorical(values=_sorted,levels=self.levels, ordered=self.ordered, + return Categorical(values=codes,levels=self.levels, ordered=self.ordered, name=self.name, fastpath=True) - def sort(self, inplace=True, ascending=True, **kwargs): + def sort(self, inplace=True, ascending=True, na_position='last', **kwargs): """ Sorts the Category inplace by level value. Only ordered Categoricals can be sorted! @@ -533,11 +526,11 @@ def sort(self, inplace=True, ascending=True, **kwargs): ---------- ascending : boolean, default True Sort ascending. Passing False sorts descending + inplace : boolean, default True + Do operation in place. 
na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end - inplace : boolean, default False - Do operation in place. Returns ------- @@ -932,3 +925,20 @@ def describe(self): result.index.name = 'levels' result.columns = ['counts','freqs'] return result + +##### utility routines ##### + +def _get_codes_for_values(values, levels): + """ + utility routine to turn values into codes given the specified levels + """ + + from pandas.core.algorithms import _get_data_algo, _hashtables + if values.dtype != levels.dtype: + values = com._ensure_object(values) + levels = com._ensure_object(levels) + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) + t = hash_klass(len(levels)) + t.map_locations(levels) + return com._ensure_platform_int(t.lookup(values)) + diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 48c3b4ece1d95..9659d4c3bd6e0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3415,34 +3415,38 @@ def _lexsort_indexer(keys, orders=None, na_position='last'): orders = [True] * len(keys) for key, order in zip(keys, orders): - key = np.asanyarray(key) - rizer = _hash.Factorizer(len(key)) - if not key.dtype == np.object_: - key = key.astype('O') + # we are already a Categorical + if is_categorical_dtype(key): + c = key - # factorize maps nans to na_sentinel=-1 - ids = rizer.factorize(key, sort=True) - n = len(rizer.uniques) - mask = (ids == -1) + # create the Categorical + else: + c = Categorical(key,ordered=True) + + if na_position not in ['last','first']: + raise ValueError('invalid na_position: {!r}'.format(na_position)) + + n = len(c.levels) + codes = c.codes.copy() + + mask = (c.codes == -1) if order: # ascending if na_position == 'last': - ids = np.where(mask, n, ids) + codes = np.where(mask, n, codes) elif na_position == 'first': - ids += 1 - else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + codes += 1 else: # not order means 
descending if na_position == 'last': - ids = np.where(mask, n, n-ids-1) + codes = np.where(mask, n, n-codes-1) elif na_position == 'first': - ids = np.where(mask, 0, n-ids) - else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + codes = np.where(mask, 0, n-codes) if mask.any(): n += 1 + shape.append(n) - labels.append(ids) + labels.append(codes) + return _indexer_from_factorized(labels, shape) def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 98e8d4f88104f..23ba06938825d 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -451,9 +451,9 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): values[mask] = na_rep return values.tolist() - def _validate_merge(self, blocks): - """ validate that we can merge these blocks """ - return True + def _concat_blocks(self, blocks, values): + """ return the block concatenation """ + return self._holder(values[0]) # block actions #### def copy(self, deep=True): @@ -1639,15 +1639,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, ndim=self.ndim, placement=self.mgr_locs) - def _validate_merge(self, blocks): - """ validate that we can merge these blocks """ + def _concat_blocks(self, blocks, values): + """ + validate that we can merge these blocks + + return the block concatenation + """ levels = self.values.levels for b in blocks: if not levels.equals(b.values.levels): raise ValueError("incompatible levels in categorical block merge") - return True + return self._holder(values[0], levels=levels) def to_native_types(self, slicer=None, na_rep='', **kwargs): """ convert to our native types format, slicing if desired """ @@ -4026,17 +4030,11 @@ def concatenate_join_units(join_units, concat_axis, copy): else: concat_values = com._concat_compat(to_concat, axis=concat_axis) - # FIXME: optimization potential: if len(join_units) == 1, single join unit - # is densified 
and sparsified back. if any(unit.needs_block_conversion for unit in join_units): # need to ask the join unit block to convert to the underlying repr for us blocks = [ unit.block for unit in join_units if unit.block is not None ] - - # may need to validate this combination - blocks[0]._validate_merge(blocks) - - return blocks[0]._holder(concat_values[0]) + return blocks[0]._concat_blocks(blocks, concat_values) else: return concat_values diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index b70e50eb3d030..642912805d06d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -983,6 +983,47 @@ def f(): df.sort(columns=["unsort"], ascending=False) self.assertRaises(TypeError, f) + # multi-columns sort + # GH 7848 + df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + df["grade"] = pd.Categorical(df["raw_grade"]) + df['grade'].cat.reorder_levels(['b', 'e', 'a']) + + # sorts 'grade' according to the order of the levels + result = df.sort(columns=['grade']) + expected = df.iloc[[1,2,5,0,3,4]] + tm.assert_frame_equal(result,expected) + + # multi + result = df.sort(columns=['grade', 'id']) + expected = df.iloc[[2,1,5,4,3,0]] + tm.assert_frame_equal(result,expected) + + # reverse + cat = Categorical(["a","c","c","b","d"], ordered=True) + res = cat.order(ascending=False) + exp_val = np.array(["d","c", "c", "b","a"],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp_val) + self.assert_numpy_array_equal(res.levels, exp_levels) + + # some NaN positions + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='last') + exp_val = np.array(["d","c","b","a", np.nan],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + # FIXME: IndexError: Out of bounds on buffer access (axis 0) + #self.assert_numpy_array_equal(res.__array__(), exp_val) + 
#self.assert_numpy_array_equal(res.levels, exp_levels) + + cat = Categorical(["a","c","b","d", np.nan], ordered=True) + res = cat.order(ascending=False, na_position='first') + exp_val = np.array([np.nan, "d","c","b","a"],dtype=object) + exp_levels = np.array(["a","b","c","d"],dtype=object) + # FIXME: IndexError: Out of bounds on buffer access (axis 0) + #self.assert_numpy_array_equal(res.__array__(), exp_val) + #self.assert_numpy_array_equal(res.levels, exp_levels) def test_slicing(self): cat = Series(Categorical([1,2,3,4])) @@ -1429,6 +1470,22 @@ def f(): pd.concat([df,df_wrong_levels]) self.assertRaises(ValueError, f) + # GH 7864 + # make sure ordering is preserved + df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + df["grade"] = pd.Categorical(df["raw_grade"]) + df['grade'].cat.reorder_levels(['e', 'a', 'b']) + + df1 = df[0:3] + df2 = df[3:] + + self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels) + self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels) + + dfx = pd.concat([df1, df2]) + dfx['grade'].cat.levels + self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels) + def test_append(self): cat = pd.Categorical(["a","b"], levels=["a","b"]) vals = [1,2]