From 0c550e6b70e17d10c8f296578ce24a033e1283bc Mon Sep 17 00:00:00 2001 From: Kernc Date: Fri, 17 Feb 2017 16:55:46 +0100 Subject: [PATCH 1/4] BUG: Fix .groupby(categorical, sort=False) failing --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/groupby.py | 5 +++++ pandas/tests/groupby/test_categorical.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ae4a3d3c3d97f..4a86d949c9700 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -539,6 +539,7 @@ Bug Fixes - Bug in ``resample``, where a non-string ```loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`) +- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ba2de295fa0a9..72d1fc96547de 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2315,6 +2315,11 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # groupby else: cat = self.grouper.unique() + all_categories = self.grouper.categories + cat.add_categories( + all_categories[ + ~all_categories.isin(cat.categories)], + inplace=True) # GH-13179 self.grouper = self.grouper.reorder_categories( cat.categories) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index eebd0e0f490c1..cfcb531bedab8 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self): tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_preserve_categories(self): + # GH-13179 + categories = list('abc') + + # ordered=True + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = pd.CategoricalIndex(categories, 
categories, ordered=True) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + + # ordered=False + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = pd.CategoricalIndex(categories, categories, ordered=False) + nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), + ordered=False) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, + sort_index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, + nosort_index) + def test_groupby_preserve_categorical_dtype(self): # GH13743, GH13854 df = DataFrame({'A': [1, 2, 1, 1, 2], From c81314690363e95f983604daa87b6823921b2df5 Mon Sep 17 00:00:00 2001 From: Kernc Date: Fri, 17 Feb 2017 18:50:57 +0100 Subject: [PATCH 2/4] PERF: add asv for categorical grouping --- asv_bench/benchmarks/groupby.py | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 03ff62568b405..59f55914ea4d3 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -492,6 +492,43 @@ def time_groupby_sum(self): self.df.groupby(['a'])['b'].sum() +class groupby_categorical(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + arr = np.random.random(N) + + self.df = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N)), + b=arr)) + self.df_ordered = DataFrame(dict( + a=Categorical(np.random.randint(10000, size=N), ordered=True), + b=arr)) + self.df_extra_cat = DataFrame(dict( + a=Categorical(np.random.randint(100, size=N), + categories=np.arange(10000)), + b=arr)) + + def time_groupby_sort(self): + self.df.groupby('a')['b'].count() + + def time_groupby_nosort(self): + self.df.groupby('a', sort=False)['b'].count() + + def time_groupby_ordered_sort(self): + self.df_ordered.groupby('a')['b'].count() + + def 
time_groupby_ordered_nosort(self): + self.df_ordered.groupby('a', sort=False)['b'].count() + + def time_groupby_extra_cat_sort(self): + self.df_extra_cat.groupby('a')['b'].count() + + def time_groupby_extra_cat_nosort(self): + self.df_extra_cat.groupby('a', sort=False)['b'].count() + + class groupby_period(object): # GH 14338 goal_time = 0.2 From 2aec326c769f5b6c55458d629043ab638ab0ae42 Mon Sep 17 00:00:00 2001 From: Kernc Date: Sat, 18 Feb 2017 17:42:23 +0100 Subject: [PATCH 3/4] fixup! BUG: Fix .groupby(categorical, sort=False) failing --- doc/source/whatsnew/v0.20.0.txt | 38 +++++++++++++++++++++++++++++++-- pandas/core/categorical.py | 29 +++++++++++++++++++++++++ pandas/core/groupby.py | 23 +------------------- pandas/indexes/category.py | 4 ++++ 4 files changed, 70 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4a86d949c9700..8682d418f2b8f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -120,6 +120,42 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) +.. _whatsnew_0200.enhancements.groupby_categorical: + +GroupBy on Categoricals +^^^^^^^^^^^^^^^^^^^^^^^ + +In previous version, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) + +Now, it works. + +.. ipython:: python + + chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']] + df = pd.DataFrame({ + 'A': np.random.randint(100), + 'B': np.random.randint(100), + 'C': np.random.randint(100), + 'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100), + categories=chromosomes, + ordered=True)}) + +Previous Behavior: + +.. 
code-block:: ipython + + In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + --------------------------------------------------------------------------- + ValueError Traceback (most recent call last) + ... + ValueError: items in new_categories are not the same as in old categories + +New Behavior: + +.. ipython:: python + + df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() + .. _whatsnew_0200.enhancements.other: Other enhancements @@ -160,7 +196,6 @@ Other enhancements .. _whatsnew_0200.api_breaking: - Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -539,7 +574,6 @@ Bug Fixes - Bug in ``resample``, where a non-string ```loffset`` argument would not be applied when resampling a timeseries (:issue:`13218`) -- Bug in ``.groupby`` where ```.groupby(categorical, sort=False)`` would raise ``ValueError`` due to non-matching categories (:issue:`13179`) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 491db2e080953..60d72d69d346b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -602,6 +602,35 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) + def _codes_for_groupby(self, sort): + """ + Return a Categorical adjusted for groupby + + Parameters + ---------- + sort : boolean + The value of the sort parameter groupby was called with. + + Returns + ------- + Categorical + In case of sort=True, self is returned with original categories + preserved. In case of sort=False, the new categories are set + to the order of appearance in codes (unless ordered=True), + followed by any unrepresented categories in original order. 
+ """ + cat = self + # sort=False should order groups in as-encountered order (GH-8868) + if not sort: + cat = self.unique() + # But all categories should be present, including those missing + # from the data (GH-13179), which .unique() dropped + cat.add_categories(self.categories[ + ~self.categories.isin(cat.categories)], + inplace=True) + cat = self.reorder_categories(cat.categories) + return cat + _ordered = None def set_ordered(self, value, inplace=False): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 72d1fc96547de..0b3fcba1c1ba5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2300,28 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - # must have an ordered categorical - if self.sort: - if not self.grouper.ordered: - - # technically we cannot group on an unordered - # Categorical - # but this a user convenience to do so; the ordering - # is preserved and if it's a reduction it doesn't make - # any difference - pass - - # fix bug #GH8868 sort=False being ignored in categorical - # groupby - else: - cat = self.grouper.unique() - all_categories = self.grouper.categories - cat.add_categories( - all_categories[ - ~all_categories.isin(cat.categories)], - inplace=True) # GH-13179 - self.grouper = self.grouper.reorder_categories( - cat.categories) + self.grouper = self.grouper._codes_for_groupby(self.sort) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index acb2758641a62..5299a094156cd 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name): result.name = name return result + def _codes_for_groupby(self, sort): + """ Return a Categorical adjusted for groupby """ + return self.values._codes_for_groupby(sort) + @classmethod def 
_add_comparison_methods(cls): """ add in comparison methods """ From 55733b8c497b896662daf687969c80a2628beef6 Mon Sep 17 00:00:00 2001 From: Kernc Date: Sun, 19 Feb 2017 01:21:59 +0100 Subject: [PATCH 4/4] fixup! BUG: Fix .groupby(categorical, sort=False) failing --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/categorical.py | 37 ++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 8682d418f2b8f..fb8a708849abc 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -125,7 +125,7 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 GroupBy on Categoricals ^^^^^^^^^^^^^^^^^^^^^^^ -In previous version, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) +In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`) Now, it works. diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 60d72d69d346b..c188f04d23873 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -604,7 +604,13 @@ def _get_categories(self): def _codes_for_groupby(self, sort): """ - Return a Categorical adjusted for groupby + If sort=False, return a copy of self, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return self. + + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). Parameters ---------- @@ -614,21 +620,24 @@ def _codes_for_groupby(self, sort): Returns ------- Categorical - In case of sort=True, self is returned with original categories - preserved. 
In case of sort=False, the new categories are set - to the order of appearance in codes (unless ordered=True), - followed by any unrepresented categories in original order. + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. """ - cat = self + if sort: + # Already sorted according to self.categories; all is fine + return self + # sort=False should order groups in as-encountered order (GH-8868) - if not sort: - cat = self.unique() - # But all categories should be present, including those missing - # from the data (GH-13179), which .unique() dropped - cat.add_categories(self.categories[ - ~self.categories.isin(cat.categories)], - inplace=True) - cat = self.reorder_categories(cat.categories) + cat = self.unique() + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat.add_categories( + self.categories[~self.categories.isin(cat.categories)], + inplace=True) + cat = self.reorder_categories(cat.categories) return cat _ordered = None