From 5a4dafb6d347107480c38903675017415539ff36 Mon Sep 17 00:00:00 2001
From: David Cottrell
Date: Fri, 13 Feb 2015 22:28:25 +0000
Subject: [PATCH] Add test and patch to fix bug GH8868 (groupby sort categorical).

---
Review notes (this section is ignored by `git am`):
- Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; verify context-line whitespace against the target tree.
- Hunk sizes and the diffstat were recomputed for this revision; the blob ids
  on the `index` lines are carried over unchanged.
- TODO confirm: `reorder_categories` presumably requires the same set of
  categories the Categorical already has, so the `sort=False` branch may raise
  if `unique()` omits unused categories or returns NaN.

 doc/source/whatsnew/v0.16.0.txt |  1 +
 pandas/core/groupby.py          |  3 +++
 pandas/tests/test_groupby.py    | 29 +++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index c12513e087619..e75e7d8d23d13 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -307,3 +307,4 @@ Bug Fixes
 - Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
 - Bug in groupby MultiIndex with missing pair (:issue:`9049`, :issue:`9344`)
 - Fixed bug in ``Series.groupby`` where grouping on ``MultiIndex`` levels would ignore the sort argument (:issue:`9444`)
+- Bug in ``DataFrame.groupby`` where ``sort=False`` was ignored when grouping by ``Categorical`` columns (:issue:`8868`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 2e8b09bc5771b..0be046bbdec42 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1925,6 +1925,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif isinstance(self.grouper, Categorical):
 
+                # GH 8868: sort=False keeps categories in order of appearance
+                if not self.sort:
+                    self.grouper = self.grouper.reorder_categories(self.grouper.unique())
                 self._labels = self.grouper.codes
                 self._group_index = self.grouper.categories
                 if self.name is None:
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index f2ea17db44211..9534bc5dd2e7c 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -3265,6 +3265,35 @@ def test_no_dummy_key_names(self):
                                   self.df['B'].values]).sum()
         self.assertEqual(result.index.names, (None, None))
 
+    def test_groupby_sort_categorical(self):
+        # GH 8868: sort=False was ignored when grouping by a Categorical
+        df = DataFrame([['(7.5, 10]', 10, 10],
+                        ['(7.5, 10]', 8, 20],
+                        ['(2.5, 5]', 5, 30],
+                        ['(5, 7.5]', 6, 40],
+                        ['(2.5, 5]', 4, 50],
+                        ['(0, 2.5]', 1, 60],
+                        ['(5, 7.5]', 7, 70]],
+                       columns=['range', 'foo', 'bar'])
+        df['range'] = Categorical(df['range'])
+
+        # sort=True: groups are expected in sorted label order
+        index = Index(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'],
+                      dtype='object', name='range')
+        expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                                  index=index, columns=['foo', 'bar'])
+
+        # sort=False: groups are expected in order of first appearance
+        index = Index(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'],
+                      dtype='object', name='range')
+        expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                    index=index, columns=['foo', 'bar'])
+
+        assert_frame_equal(expected_sort,
+                           df.groupby('range', sort=True).first())
+        assert_frame_equal(expected_nosort,
+                           df.groupby('range', sort=False).first())
+
     def test_groupby_sort_multiindex_series(self):
         # series multiindex groupby sort argument was not being passed through _compress_group_index
         # GH 9444