Skip to content

BUG: Categorical.unique() preserves categories #15439

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,43 @@ def time_groupby_sum(self):
self.df.groupby(['a'])['b'].sum()


class groupby_categorical(object):
    """Benchmarks for ``groupby(...).count()`` keyed on a categorical column.

    ``setup`` builds three frames that share the same float column ``b``:

    * ``df``           -- unordered categorical key drawn from 10000 values
    * ``df_ordered``   -- same distribution, but ``ordered=True``
    * ``df_extra_cat`` -- key drawn from 100 values while 10000 categories
      are declared, so most categories never appear in the data
    """

    goal_time = 0.2

    def setup(self):
        N = 100000
        values = np.random.random(N)

        # Random draws are made in the same order for every run:
        # plain key, ordered key, then the sparse ("extra categories") key.
        plain_key = Categorical(np.random.randint(10000, size=N))
        self.df = DataFrame({'a': plain_key, 'b': values})

        ordered_key = Categorical(np.random.randint(10000, size=N),
                                  ordered=True)
        self.df_ordered = DataFrame({'a': ordered_key, 'b': values})

        sparse_key = Categorical(np.random.randint(100, size=N),
                                 categories=np.arange(10000))
        self.df_extra_cat = DataFrame({'a': sparse_key, 'b': values})

    # -- unordered categorical key ------------------------------------

    def time_groupby_sort(self):
        grouped = self.df.groupby('a')
        grouped['b'].count()

    def time_groupby_nosort(self):
        grouped = self.df.groupby('a', sort=False)
        grouped['b'].count()

    # -- ordered categorical key --------------------------------------

    def time_groupby_ordered_sort(self):
        grouped = self.df_ordered.groupby('a')
        grouped['b'].count()

    def time_groupby_ordered_nosort(self):
        grouped = self.df_ordered.groupby('a', sort=False)
        grouped['b'].count()

    # -- key with many categories absent from the data ----------------

    def time_groupby_extra_cat_sort(self):
        grouped = self.df_extra_cat.groupby('a')
        grouped['b'].count()

    def time_groupby_extra_cat_nosort(self):
        grouped = self.df_extra_cat.groupby('a', sort=False)
        grouped['b'].count()


class groupby_period(object):
# GH 14338
goal_time = 0.2
Expand Down
37 changes: 36 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,42 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)

.. _whatsnew_0200.enhancements.groupy_categorical:

GroupBy on Categoricals
^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueError`` when grouping on a categorical series with some categories not appearing in the data. (:issue:`13179`)

Now, it works.

.. ipython:: python

chromosomes = np.r_[np.arange(1, 23).astype(str), ['X', 'Y']]
df = pd.DataFrame({
'A': np.random.randint(100),
'B': np.random.randint(100),
'C': np.random.randint(100),
'chromosomes': pd.Categorical(np.random.choice(chromosomes, 100),
categories=chromosomes,
ordered=True)})

Previous Behavior:

.. code-block:: ipython

In [3]: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
...
ValueError: items in new_categories are not the same as in old categories

New Behavior:

.. ipython:: python

df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()

.. _whatsnew_0200.enhancements.other:

Other enhancements
Expand Down Expand Up @@ -160,7 +196,6 @@ Other enhancements

.. _whatsnew_0200.api_breaking:


Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
38 changes: 38 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,44 @@ def _get_categories(self):
categories = property(fget=_get_categories, fset=_set_categories,
doc=_categories_doc)

def _codes_for_groupby(self, sort):
    """
    Return a Categorical whose category order is suitable for groupby.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    sort : boolean
        The value of the sort parameter groupby was called with.

    Returns
    -------
    Categorical
        If sort=True, ``self`` itself (no copy is made).
        If sort=False, a new Categorical with the same values whose
        categories are those of ``self.unique()`` followed by any
        categories of ``self`` that ``.unique()`` did not report, in
        their original relative order.
    """
    if sort:
        # Already sorted according to self.categories; all is fine
        return self

    # sort=False should order groups in as-encountered order (GH-8868)
    cat = self.unique()

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179), which .unique()
    # above may have dropped.
    missing = self.categories[~self.categories.isin(cat.categories)]

    # Use the returned object rather than ``inplace=True``: it avoids
    # mutating ``cat`` behind the scenes and keeps working on pandas
    # versions where the ``inplace`` keyword is no longer accepted.
    cat = cat.add_categories(missing)

    return self.reorder_categories(cat.categories)

_ordered = None

def set_ordered(self, value, inplace=False):
Expand Down
18 changes: 1 addition & 17 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2300,23 +2300,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# a passed Categorical
elif is_categorical_dtype(self.grouper):

# must have an ordered categorical
if self.sort:
if not self.grouper.ordered:

# technically we cannot group on an unordered
# Categorical
# but this a user convenience to do so; the ordering
# is preserved and if it's a reduction it doesn't make
# any difference
pass

# fix bug #GH8868 sort=False being ignored in categorical
# groupby
else:
cat = self.grouper.unique()
self.grouper = self.grouper.reorder_categories(
cat.categories)
self.grouper = self.grouper._codes_for_groupby(self.sort)

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
Expand Down
4 changes: 4 additions & 0 deletions pandas/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,10 @@ def _append_same_dtype(self, to_concat, name):
result.name = name
return result

def _codes_for_groupby(self, sort):
    """
    Return a Categorical adjusted for groupby.

    Delegates to the ``_codes_for_groupby`` method of ``self.values``,
    forwarding ``sort`` unchanged and returning whatever it returns.

    Parameters
    ----------
    sort : boolean
        The value of the sort parameter groupby was called with.
    """
    underlying = self.values
    return underlying._codes_for_groupby(sort)

@classmethod
def _add_comparison_methods(cls):
""" add in comparison methods """
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,30 @@ def test_groupby_multi_categorical_as_index(self):

tm.assert_frame_equal(result, expected, check_index_type=True)

def test_groupby_preserve_categories(self):
# GH-13179
categories = list('abc')

# ordered=True
df = DataFrame({'A': pd.Categorical(list('ba'),
categories=categories,
ordered=True)})
index = pd.CategoricalIndex(categories, categories, ordered=True)
tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)

# ordered=False
df = DataFrame({'A': pd.Categorical(list('ba'),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC from the issue there are 4 cases? (product of sort=True/False, ordered=True/False)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

groupby above and below is called twice (sort=True/False) for each case of ordered. There are four assertions.

categories=categories,
ordered=False)})
sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
ordered=False)
tm.assert_index_equal(df.groupby('A', sort=True).first().index,
sort_index)
tm.assert_index_equal(df.groupby('A', sort=False).first().index,
nosort_index)

def test_groupby_preserve_categorical_dtype(self):
# GH13743, GH13854
df = DataFrame({'A': [1, 2, 1, 1, 2],
Expand Down