Skip to content

Commit c06f9ce

Browse files
committed
Merge pull request #10508 from sinhrks/groupby_dtcat
BUG: Groupby(sort=False) with datetime-like Categorical raises ValueError
2 parents a743743 + 29f1f42 commit c06f9ce

File tree

5 files changed

+131
-12
lines changed

5 files changed

+131
-12
lines changed

doc/source/whatsnew/v0.17.0.txt

+18
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,22 @@ Other API Changes
303303
- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
304304
- Serialize metadata properties of subclasses of pandas objects (:issue:`10553`).
305305
- ``Categorical.name`` was removed to make `Categorical` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
306+
- ``Categorical.unique`` now returns a new ``Categorical`` whose ``categories`` and ``codes`` are unique, rather than returning ``np.array`` (:issue:`10508`)
306307

308+
- unordered category: values and categories are sorted by appearance order.
309+
- ordered category: values are sorted by appearance order, categories keep their existing order.
310+
311+
.. ipython :: python
312+
313+
cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'], ordered=True)
314+
cat
315+
cat.unique()
316+
317+
cat = pd.Categorical(['C', 'A', 'B', 'C'], categories=['A', 'B', 'C'])
318+
cat
319+
cat.unique()
320+
321+
- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`)
307322
- ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)
308323

309324
=============================== ==============================================================
@@ -365,6 +380,9 @@ Bug Fixes
365380
- Bug in ``DataFrame.interpolate`` with ``axis=1`` and ``inplace=True`` (:issue:`10395`)
366381
- Bug in ``io.sql.get_schema`` when specifying multiple columns as primary
367382
key (:issue:`10385`).
383+
384+
- Bug in ``groupby(sort=False)`` where grouping by a datetime-like ``Categorical`` raised ``ValueError`` (:issue:`10505`)
385+
368386
- Bug in ``test_categorical`` on big-endian builds (:issue:`10425`)
369387
- Bug in ``Series.shift`` and ``DataFrame.shift`` not supporting categorical data (:issue:`9416`)
370388
- Bug in ``Series.map`` using categorical ``Series`` raises ``AttributeError`` (:issue:`10324`)

pandas/core/categorical.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -1558,19 +1558,30 @@ def mode(self):
15581558

15591559
def unique(self):
15601560
"""
1561-
Return the unique values.
1561+
Return the ``Categorical`` which ``categories`` and ``codes`` are unique.
1562+
Unused categories are NOT returned.
15621563
1563-
Unused categories are NOT returned. Unique values are returned in order
1564-
of appearance.
1564+
- unordered category: values and categories are sorted by appearance
1565+
order.
1566+
- ordered category: values are sorted by appearance order, categories
1567+
keeps existing order.
15651568
15661569
Returns
15671570
-------
1568-
unique values : array
1571+
unique values : ``Categorical``
15691572
"""
1573+
15701574
from pandas.core.nanops import unique1d
15711575
# unlike np.unique, unique1d does not sort
15721576
unique_codes = unique1d(self.codes)
1573-
return take_1d(self.categories.values, unique_codes)
1577+
cat = self.copy()
1578+
# keep nan in codes
1579+
cat._codes = unique_codes
1580+
# exclude nan from indexer for categories
1581+
take_codes = unique_codes[unique_codes != -1]
1582+
if self.ordered:
1583+
take_codes = sorted(take_codes)
1584+
return cat.set_categories(cat.categories.take(take_codes))
15741585

15751586
def equals(self, other):
15761587
"""

pandas/core/groupby.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1959,7 +1959,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19591959

19601960
# fix bug #GH8868 sort=False being ignored in categorical groupby
19611961
else:
1962-
self.grouper = self.grouper.reorder_categories(self.grouper.unique())
1962+
cat = self.grouper.unique()
1963+
self.grouper = self.grouper.reorder_categories(cat.categories)
19631964

19641965
# we make a CategoricalIndex out of the cat grouper
19651966
# preserving the categories / ordered attributes

pandas/tests/test_categorical.py

+44-5
Original file line numberDiff line numberDiff line change
@@ -958,20 +958,59 @@ def test_min_max(self):
958958
self.assertEqual(_max, 1)
959959

960960
def test_unique(self):
961-
cat = Categorical(["a","b"])
962-
exp = np.asarray(["a","b"])
961+
# categories are reordered based on value when ordered=False
962+
cat = Categorical(["a", "b"])
963+
exp = np.asarray(["a", "b"])
963964
res = cat.unique()
964965
self.assert_numpy_array_equal(res, exp)
965966

966-
cat = Categorical(["a","b","a","a"], categories=["a","b","c"])
967+
cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
967968
res = cat.unique()
968969
self.assert_numpy_array_equal(res, exp)
970+
tm.assert_categorical_equal(res, Categorical(exp))
969971

970-
# unique should not sort
971-
cat = Categorical(["b", "b", np.nan, "a"], categories=["a","b","c"])
972+
cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
973+
exp = np.asarray(["c", "a", "b"])
974+
res = cat.unique()
975+
self.assert_numpy_array_equal(res, exp)
976+
tm.assert_categorical_equal(res, Categorical(exp, categories=['c', 'a', 'b']))
977+
978+
# nan must be removed
979+
cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
972980
res = cat.unique()
973981
exp = np.asarray(["b", np.nan, "a"], dtype=object)
974982
self.assert_numpy_array_equal(res, exp)
983+
tm.assert_categorical_equal(res, Categorical(["b", np.nan, "a"], categories=["b", "a"]))
984+
985+
def test_unique_ordered(self):
986+
# keep categories order when ordered=True
987+
cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True)
988+
res = cat.unique()
989+
exp = np.asarray(['b', 'a'])
990+
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
991+
self.assert_numpy_array_equal(res, exp)
992+
tm.assert_categorical_equal(res, exp_cat)
993+
994+
cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
995+
res = cat.unique()
996+
exp = np.asarray(['c', 'b', 'a'])
997+
exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True)
998+
self.assert_numpy_array_equal(res, exp)
999+
tm.assert_categorical_equal(res, exp_cat)
1000+
1001+
cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True)
1002+
res = cat.unique()
1003+
exp = np.asarray(['b', 'a'])
1004+
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
1005+
self.assert_numpy_array_equal(res, exp)
1006+
tm.assert_categorical_equal(res, exp_cat)
1007+
1008+
cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], ordered=True)
1009+
res = cat.unique()
1010+
exp = np.asarray(['b', np.nan, 'a'], dtype=object)
1011+
exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True)
1012+
self.assert_numpy_array_equal(res, exp)
1013+
tm.assert_categorical_equal(res, exp_cat)
9751014

9761015
def test_mode(self):
9771016
s = Categorical([1,1,2,4,5,5,5], categories=[5,4,3,2,1], ordered=True)

pandas/tests/test_groupby.py

+51-1
Original file line numberDiff line numberDiff line change
@@ -3413,7 +3413,8 @@ def test_groupby_sort_categorical(self):
34133413

34143414
col = 'range'
34153415
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
3416-
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
3416+
# when categories is ordered, group is ordered by category's order
3417+
assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
34173418

34183419
df['range'] = Categorical(df['range'],ordered=False)
34193420
index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], dtype='object')
@@ -3431,6 +3432,55 @@ def test_groupby_sort_categorical(self):
34313432
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
34323433
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
34333434

3435+
def test_groupby_sort_categorical_datetimelike(self):
3436+
# GH10505
3437+
3438+
# use same data as test_groupby_sort_categorical, which category is
3439+
# corresponding to datetime.month
3440+
df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
3441+
datetime(2011, 2, 1), datetime(2011, 5, 1),
3442+
datetime(2011, 2, 1), datetime(2011, 1, 1),
3443+
datetime(2011, 5, 1)],
3444+
'foo': [10, 8, 5, 6, 4, 1, 7],
3445+
'bar': [10, 20, 30, 40, 50, 60, 70]},
3446+
columns=['dt', 'foo', 'bar'])
3447+
3448+
# ordered=True
3449+
df['dt'] = Categorical(df['dt'], ordered=True)
3450+
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
3451+
datetime(2011, 5, 1), datetime(2011, 7, 1)]
3452+
result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
3453+
result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
3454+
3455+
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
3456+
datetime(2011, 5, 1), datetime(2011, 1, 1)]
3457+
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
3458+
columns=['foo', 'bar'])
3459+
result_nosort.index = CategoricalIndex(index, categories=index,
3460+
name='dt', ordered=True)
3461+
3462+
col = 'dt'
3463+
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
3464+
# when categories is ordered, group is ordered by category's order
3465+
assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
3466+
3467+
# ordered = False
3468+
df['dt'] = Categorical(df['dt'], ordered=False)
3469+
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
3470+
datetime(2011, 5, 1), datetime(2011, 7, 1)]
3471+
result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
3472+
result_sort.index = CategoricalIndex(index, name='dt')
3473+
3474+
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
3475+
datetime(2011, 5, 1), datetime(2011, 1, 1)]
3476+
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
3477+
columns=['foo', 'bar'])
3478+
result_nosort.index = CategoricalIndex(index, categories=index, name='dt')
3479+
3480+
col = 'dt'
3481+
assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
3482+
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
3483+
34343484

34353485
def test_groupby_sort_multiindex_series(self):
34363486
# series multiindex groupby sort argument was not being passed through _compress_group_index

0 commit comments

Comments
 (0)