Merge pull request #7850 from jreback/cat_sort

jreback · jreback · commit 9857a0eb52c2 · 2014-07-29T18:44:36.000-04:00
BUG: fix multi-column sort that includes Categoricals / concat (GH7848/GH7864)
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -116,7 +116,8 @@ Categoricals in Series/DataFrame
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
-methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`).
+methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
+:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`).
 
 For full docs, see the :ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`.
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -16,7 +16,6 @@
 from pandas.core.config import get_option
 from pandas.core import format as fmt
 
-
 def _cat_compare_op(op):
     def f(self, other):
         if isinstance(other, (Categorical, np.ndarray)):
@@ -45,16 +44,6 @@ def _maybe_to_categorical(array):
     return array
 
 
-def _get_codes_for_values(values, levels):
-    from pandas.core.algorithms import _get_data_algo, _hashtables
-    if values.dtype != levels.dtype:
-        values = com._ensure_object(values)
-        levels = com._ensure_object(levels)
-    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
-    t = hash_klass(len(levels))
-    t.map_locations(levels)
-    return com._ensure_platform_int(t.lookup(values))
-
 _codes_doc = """The level codes of this categorical.
 
 Level codes are an array if integer which are the positions of the real
@@ -484,7 +473,7 @@ def argsort(self, ascending=True, **kwargs):
             result = result[::-1]
         return result
 
-    def order(self, inplace=False, ascending=True, **kwargs):
+    def order(self, inplace=False, ascending=True, na_position='last', **kwargs):
         """ Sorts the Category by level value returning a new Categorical by default.
 
         Only ordered Categoricals can be sorted!
@@ -495,11 +484,11 @@ def order(self, inplace=False, ascending=True, **kwargs):
         ----------
         ascending : boolean, default True
             Sort ascending. Passing False sorts descending
+        inplace : boolean, default False
+            Do operation in place.
         na_position : {'first', 'last'} (optional, default='last')
             'first' puts NaNs at the beginning
             'last' puts NaNs at the end
-        inplace : boolean, default False
-            Do operation in place.
 
         Returns
         -------
@@ -511,18 +500,22 @@ def order(self, inplace=False, ascending=True, **kwargs):
         """
         if not self.ordered:
             raise TypeError("Categorical not ordered")
-        _sorted = np.sort(self._codes.copy())
+        if na_position not in ['last','first']:
+            raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+        codes = np.sort(self._codes.copy())
         if not ascending:
-            _sorted = _sorted[::-1]
+            codes = codes[::-1]
+
         if inplace:
-            self._codes = _sorted
+            self._codes = codes
             return
         else:
-            return Categorical(values=_sorted,levels=self.levels, ordered=self.ordered,
+            return Categorical(values=codes,levels=self.levels, ordered=self.ordered,
                                name=self.name, fastpath=True)
 
 
-    def sort(self, inplace=True, ascending=True, **kwargs):
+    def sort(self, inplace=True, ascending=True, na_position='last', **kwargs):
         """ Sorts the Category inplace by level value.
 
         Only ordered Categoricals can be sorted!
@@ -533,11 +526,11 @@ def sort(self, inplace=True, ascending=True, **kwargs):
         ----------
         ascending : boolean, default True
             Sort ascending. Passing False sorts descending
+        inplace : boolean, default True
+            Do operation in place.
         na_position : {'first', 'last'} (optional, default='last')
             'first' puts NaNs at the beginning
             'last' puts NaNs at the end
-        inplace : boolean, default False
-            Do operation in place.
 
         Returns
         -------
@@ -932,3 +925,20 @@ def describe(self):
         result.index.name = 'levels'
         result.columns = ['counts','freqs']
         return result
+
+##### utility routines #####
+
+def _get_codes_for_values(values, levels):
+    """
+    utility routine to turn values into codes given the specified levels
+    """
+
+    from pandas.core.algorithms import _get_data_algo, _hashtables
+    if values.dtype != levels.dtype:
+        values = com._ensure_object(values)
+        levels = com._ensure_object(levels)
+    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
+    t = hash_klass(len(levels))
+    t.map_locations(levels)
+    return com._ensure_platform_int(t.lookup(values))
+
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -3415,34 +3415,38 @@ def _lexsort_indexer(keys, orders=None, na_position='last'):
         orders = [True] * len(keys)
 
     for key, order in zip(keys, orders):
-        key = np.asanyarray(key)
-        rizer = _hash.Factorizer(len(key))
 
-        if not key.dtype == np.object_:
-            key = key.astype('O')
+        # we are already a Categorical
+        if is_categorical_dtype(key):
+            c = key
 
-        # factorize maps nans to na_sentinel=-1
-        ids = rizer.factorize(key, sort=True)
-        n = len(rizer.uniques)
-        mask = (ids == -1)
+        # create the Categorical
+        else:
+            c = Categorical(key,ordered=True)
+
+        if na_position not in ['last','first']:
+            raise ValueError('invalid na_position: {!r}'.format(na_position))
+
+        n = len(c.levels)
+        codes = c.codes.copy()
+
+        mask = (c.codes == -1)
         if order: # ascending
             if na_position == 'last':
-                ids = np.where(mask, n, ids)
+                codes = np.where(mask, n, codes)
             elif na_position == 'first':
-                ids += 1
-            else:
-                raise ValueError('invalid na_position: {!r}'.format(na_position))
+                codes += 1
         else: # not order means descending
             if na_position == 'last':
-                ids = np.where(mask, n, n-ids-1)
+                codes = np.where(mask, n, n-codes-1)
             elif na_position == 'first':
-                ids = np.where(mask, 0, n-ids)
-            else:
-                raise ValueError('invalid na_position: {!r}'.format(na_position))
+                codes = np.where(mask, 0, n-codes)
         if mask.any():
             n += 1
+
         shape.append(n)
-        labels.append(ids)
+        labels.append(codes)
+
     return _indexer_from_factorized(labels, shape)
 
 def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -451,9 +451,9 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs):
         values[mask] = na_rep
         return values.tolist()
 
-    def _validate_merge(self, blocks):
-        """ validate that we can merge these blocks """
-        return True
+    def _concat_blocks(self, blocks, values):
+        """ return the block concatenation """
+        return self._holder(values[0])
 
     # block actions ####
     def copy(self, deep=True):
@@ -1639,15 +1639,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
                           ndim=self.ndim,
                           placement=self.mgr_locs)
 
-    def _validate_merge(self, blocks):
-        """ validate that we can merge these blocks """
+    def _concat_blocks(self, blocks, values):
+        """
+        validate that we can merge these blocks
+
+        return the block concatenation
+        """
 
         levels = self.values.levels
         for b in blocks:
             if not levels.equals(b.values.levels):
                 raise ValueError("incompatible levels in categorical block merge")
 
-        return True
+        return self._holder(values[0], levels=levels)
 
     def to_native_types(self, slicer=None, na_rep='', **kwargs):
         """ convert to our native types format, slicing if desired """
@@ -4026,17 +4030,11 @@ def concatenate_join_units(join_units, concat_axis, copy):
     else:
         concat_values = com._concat_compat(to_concat, axis=concat_axis)
 
-    # FIXME: optimization potential: if len(join_units) == 1, single join unit
-    # is densified and sparsified back.
     if any(unit.needs_block_conversion for unit in join_units):
 
         # need to ask the join unit block to convert to the underlying repr for us
         blocks = [ unit.block for unit in join_units if unit.block is not None ]
-
-        # may need to validate this combination
-        blocks[0]._validate_merge(blocks)
-
-        return blocks[0]._holder(concat_values[0])
+        return blocks[0]._concat_blocks(blocks, concat_values)
     else:
         return concat_values
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -983,6 +983,47 @@ def f():
             df.sort(columns=["unsort"], ascending=False)
         self.assertRaises(TypeError, f)
 
+        # multi-columns sort
+        # GH 7848
+        df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
+        df["grade"] = pd.Categorical(df["raw_grade"])
+        df['grade'].cat.reorder_levels(['b', 'e', 'a'])
+
+        # sorts 'grade' according to the order of the levels
+        result = df.sort(columns=['grade'])
+        expected = df.iloc[[1,2,5,0,3,4]]
+        tm.assert_frame_equal(result,expected)
+
+        # multi
+        result = df.sort(columns=['grade', 'id'])
+        expected = df.iloc[[2,1,5,4,3,0]]
+        tm.assert_frame_equal(result,expected)
+
+        # reverse
+        cat = Categorical(["a","c","c","b","d"], ordered=True)
+        res = cat.order(ascending=False)
+        exp_val = np.array(["d","c", "c", "b","a"],dtype=object)
+        exp_levels = np.array(["a","b","c","d"],dtype=object)
+        self.assert_numpy_array_equal(res.__array__(), exp_val)
+        self.assert_numpy_array_equal(res.levels, exp_levels)
+
+        # some NaN positions
+
+        cat = Categorical(["a","c","b","d", np.nan], ordered=True)
+        res = cat.order(ascending=False, na_position='last')
+        exp_val = np.array(["d","c","b","a", np.nan],dtype=object)
+        exp_levels = np.array(["a","b","c","d"],dtype=object)
+        # FIXME: IndexError: Out of bounds on buffer access (axis 0)
+        #self.assert_numpy_array_equal(res.__array__(), exp_val)
+        #self.assert_numpy_array_equal(res.levels, exp_levels)
+
+        cat = Categorical(["a","c","b","d", np.nan], ordered=True)
+        res = cat.order(ascending=False, na_position='first')
+        exp_val = np.array([np.nan, "d","c","b","a"],dtype=object)
+        exp_levels = np.array(["a","b","c","d"],dtype=object)
+        # FIXME: IndexError: Out of bounds on buffer access (axis 0)
+        #self.assert_numpy_array_equal(res.__array__(), exp_val)
+        #self.assert_numpy_array_equal(res.levels, exp_levels)
 
     def test_slicing(self):
         cat = Series(Categorical([1,2,3,4]))
@@ -1429,6 +1470,22 @@ def f():
             pd.concat([df,df_wrong_levels])
         self.assertRaises(ValueError, f)
 
+        # GH 7864
+        # make sure ordering is preserved
+        df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
+        df["grade"] = pd.Categorical(df["raw_grade"])
+        df['grade'].cat.reorder_levels(['e', 'a', 'b'])
+
+        df1 = df[0:3]
+        df2 = df[3:]
+
+        self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels)
+        self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels)
+
+        dfx = pd.concat([df1, df2])
+        dfx['grade'].cat.levels
+        self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels)
+
     def test_append(self):
         cat = pd.Categorical(["a","b"], levels=["a","b"])
         vals = [1,2]