Merge pull request #10508 from sinhrks/groupby_dtcat

sinhrks · sinhrks · commit c06f9ce1b98e · 2015-07-29T00:28:20.000+09:00
BUG: Groupby(sort=False) with datetime-like Categorical raises ValueError
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -303,7 +303,22 @@ Other API Changes
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
 - Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
 - ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)
 
+   - unordered category: values and categories are sorted by appearance order.
+   - ordered category: values are sorted by appearance order, categories keep their existing order.
+
+.. ipython :: python
+
+   cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)
+   cat
+   cat.unique()
+
+   cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'])
+   cat
+   cat.unique()
+
+- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
 - ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)
 
   ===============================     ==============================================================
@@ -365,6 +380,9 @@ Bug Fixes
 - Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
 - Bug in ``io.sql.get_schema`` when specifying multiple columns as primary
   key (:issue:`10385`).
+
+- Bug in ``groupby(sort=False)`` with datetime-like ``Categorical`` raises ``ValueError`` (:issue:`10505`)
+
 - Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
 - Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`)
 - Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1558,19 +1558,30 @@ def mode(self):
 
     def unique(self):
         """
-        Return the unique values.
+        Return a ``Categorical`` whose ``categories`` and ``codes`` are unique.
+        Unused categories are NOT returned.
 
-        Unused categories are NOT returned. Unique values are returned in order
-        of appearance.
+        - unordered category: values and categories are sorted by appearance
+          order.
+        - ordered category: values are sorted by appearance order, categories
+          keep their existing order.
 
         Returns
         -------
-        unique values : array
+        unique values : ``Categorical``
         """
+
         from pandas.core.nanops import unique1d
         # unlike np.unique, unique1d does not sort
         unique_codes = unique1d(self.codes)
-        return take_1d(self.categories.values, unique_codes)
+        cat = self.copy()
+        # keep nan in codes
+        cat._codes = unique_codes
+        # exclude nan from indexer for categories
+        take_codes = unique_codes[unique_codes != -1]
+        if self.ordered:
+            take_codes = sorted(take_codes)
+        return cat.set_categories(cat.categories.take(take_codes))
 
     def equals(self, other):
         """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1959,7 +1959,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
 
                 # fix bug #GH8868 sort=False being ignored in categorical groupby
                 else:
-                    self.grouper = self.grouper.reorder_categories(self.grouper.unique())
+                    cat = self.grouper.unique()
+                    self.grouper = self.grouper.reorder_categories(cat.categories)
 
                 # we make a CategoricalIndex out of the cat grouper
                 # preserving the categories / ordered attributes
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -958,20 +958,59 @@ def test_min_max(self):
         self.assertEqual(_max, 1)
 
     def test_unique(self):
-        cat = Categorical(["a","b"])
-        exp = np.asarray(["a","b"])
+        # categories are reordered based on value when ordered=False
+        cat = Categorical(["a", "b"])
+        exp = np.asarray(["a", "b"])
         res = cat.unique()
         self.assert_numpy_array_equal(res, exp)
 
-        cat = Categorical(["a","b","a","a"], categories=["a","b","c"])
+        cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
         res = cat.unique()
         self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, Categorical(exp))
 
-        # unique should not sort
-        cat = Categorical(["b", "b", np.nan, "a"], categories=["a","b","c"])
+        cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
+        exp = np.asarray(["c", "a", "b"])
+        res = cat.unique()
+        self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, Categorical(exp, categories=['c', 'a', 'b']))
+
+        # nan is kept in the unique values but must not appear in categories
+        cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
         res = cat.unique()
         exp = np.asarray(["b", np.nan, "a"], dtype=object)
         self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, Categorical(["b", np.nan, "a"], categories=["b", "a"]))
+
+    def test_unique_ordered(self):
+        # keep categories order when ordered=True
+        cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
+        res = cat.unique()
+        exp = np.asarray(['b', 'a'])
+        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
+        self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, exp_cat)
+
+        cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
+        res = cat.unique()
+        exp = np.asarray(['c', 'b', 'a'])
+        exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True)
+        self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, exp_cat)
+
+        cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
+        res = cat.unique()
+        exp = np.asarray(['b', 'a'])
+        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
+        self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, exp_cat)
+
+        cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], ordered=True)
+        res = cat.unique()
+        exp = np.asarray(['b', np.nan, 'a'], dtype=object)
+        exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
+        self.assert_numpy_array_equal(res, exp)
+        tm.assert_categorical_equal(res, exp_cat)
 
     def test_mode(self):
         s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -3413,7 +3413,8 @@ def test_groupby_sort_categorical(self):
 
         col = 'range'
         assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
-        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+        # when the categorical is ordered, groups follow the categories' order
+        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
 
         df['range'] = Categorical(df['range'],ordered=False)
         index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object')
@@ -3431,6 +3432,55 @@ def test_groupby_sort_categorical(self):
         assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
         assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
 
+    def test_groupby_sort_categorical_datetimelike(self):
+        # GH10505
+
+        # use the same data as test_groupby_sort_categorical, where each
+        # category corresponds to a datetime month
+        df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+                               datetime(2011, 2, 1), datetime(2011, 5, 1),
+                               datetime(2011, 2, 1), datetime(2011, 1, 1),
+                               datetime(2011, 5, 1)],
+                        'foo': [10, 8, 5, 6, 4, 1, 7],
+                        'bar': [10, 20, 30, 40, 50, 60, 70]},
+                       columns=['dt', 'foo', 'bar'])
+
+        # ordered=True
+        df['dt'] = Categorical(df['dt'], ordered=True)
+        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
+        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+        result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
+        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                  columns=['foo', 'bar'])
+        result_nosort.index = CategoricalIndex(index, categories=index,
+                                               name='dt', ordered=True)
+
+        col = 'dt'
+        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+        # when the categorical is ordered, groups follow the categories' order
+        assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+        # ordered = False
+        df['dt'] = Categorical(df['dt'], ordered=False)
+        index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 7, 1)]
+        result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+        result_sort.index = CategoricalIndex(index, name='dt')
+
+        index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
+        result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                  columns=['foo', 'bar'])
+        result_nosort.index = CategoricalIndex(index, categories=index, name='dt')
+
+        col = 'dt'
+        assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+        assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
 
     def test_groupby_sort_multiindex_series(self):
         # series multiindex groupby sort argument was not being passed through _compress_group_index