Skip to content

ENH: add sort_categories argument to union_categoricals #13846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ Unioning
.. versionadded:: 0.19.0

If you want to combine categoricals that do not necessarily have
the same categories, the `union_categorical` function will
the same categories, the ``union_categoricals`` function will
combine a list-like of categoricals. The new categories
will be the union of the categories being combined.

Expand All @@ -667,6 +667,14 @@ will be the union of the categories being combined.
b = pd.Categorical(["a", "b"])
union_categoricals([a, b])

By default, the resulting categories will be ordered as
they appear in the data. If you want the categories to
be lexsorted, use the ``sort_categories=True`` argument.

.. ipython:: python

union_categoricals([a, b], sort_categories=True)

.. note::

In addition to the "easy" case of combining two categoricals of the same
Expand Down
36 changes: 36 additions & 0 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,42 @@ def test_union_categoricals_ordered(self):
with tm.assertRaisesRegexp(TypeError, msg):
union_categoricals([c1, c2])

def test_union_categoricals_sort(self):
    """Check ``union_categoricals`` with ``sort_categories=True``.

    In every case the values keep their concatenation order while the
    resulting categories are expected to be lexsorted.
    """
    # GH 13763
    c1 = Categorical(['x', 'y', 'z'])
    c2 = Categorical(['a', 'b', 'c'])
    result = union_categoricals([c1, c2], sort_categories=True)
    expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                           categories=['a', 'b', 'c', 'x', 'y', 'z'])
    tm.assert_categorical_equal(result, expected)

    # fastpath: identical (unsorted) categories on both inputs
    c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
    c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
    result = union_categoricals([c1, c2], sort_categories=True)
    expected = Categorical(['a', 'b', 'b', 'c'],
                           categories=['a', 'b', 'c'])
    tm.assert_categorical_equal(result, expected)

    # NaN is a missing value, never a category
    c1 = Categorical(['x', np.nan])
    c2 = Categorical([np.nan, 'b'])
    result = union_categoricals([c1, c2], sort_categories=True)
    expected = Categorical(['x', np.nan, np.nan, 'b'],
                           categories=['b', 'x'])
    tm.assert_categorical_equal(result, expected)

    # all-NaN inputs: no categories at all
    c1 = Categorical([np.nan])
    c2 = Categorical([np.nan])
    result = union_categoricals([c1, c2], sort_categories=True)
    expected = Categorical([np.nan, np.nan], categories=[])
    tm.assert_categorical_equal(result, expected)

    # empty inputs
    c1 = Categorical([])
    c2 = Categorical([])
    result = union_categoricals([c1, c2], sort_categories=True)
    expected = Categorical([])
    tm.assert_categorical_equal(result, expected)

def test_union_categoricals_sort_false(self):
    """Companion to the sorted test, using ``sort_categories=False``.

    The same inputs are used; the resulting categories are expected to
    follow order of appearance instead of being lexsorted.
    """
    # GH 13763
    c1 = Categorical(['x', 'y', 'z'])
    c2 = Categorical(['a', 'b', 'c'])
    result = union_categoricals([c1, c2], sort_categories=False)
    expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                           categories=['x', 'y', 'z', 'a', 'b', 'c'])
    tm.assert_categorical_equal(result, expected)

    # fastpath: the shared category order must be left untouched
    c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
    c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
    result = union_categoricals([c1, c2], sort_categories=False)
    expected = Categorical(['a', 'b', 'b', 'c'],
                           categories=['b', 'a', 'c'])
    tm.assert_categorical_equal(result, expected)

    # NaN is a missing value, never a category
    c1 = Categorical(['x', np.nan])
    c2 = Categorical([np.nan, 'b'])
    result = union_categoricals([c1, c2], sort_categories=False)
    expected = Categorical(['x', np.nan, np.nan, 'b'],
                           categories=['x', 'b'])
    tm.assert_categorical_equal(result, expected)

    # all-NaN inputs: no categories at all
    c1 = Categorical([np.nan])
    c2 = Categorical([np.nan])
    result = union_categoricals([c1, c2], sort_categories=False)
    expected = Categorical([np.nan, np.nan], categories=[])
    tm.assert_categorical_equal(result, expected)

    # empty inputs
    c1 = Categorical([])
    c2 = Categorical([])
    result = union_categoricals([c1, c2], sort_categories=False)
    expected = Categorical([])
    tm.assert_categorical_equal(result, expected)

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
63 changes: 35 additions & 28 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,22 +211,23 @@ def convert_categorical(x):
return Categorical(concatted, rawcats)


def union_categoricals(to_union):
def union_categoricals(to_union, sort_categories=False):
"""
Combine list-like of Categoricals, unioning categories. All
must have the same dtype, and none can be ordered.
categories must have the same dtype.

.. versionadded:: 0.19.0

Parameters
----------
to_union : list-like of Categoricals
sort_categories : boolean, default False
If True, the resulting categories will be lexsorted; otherwise
they will be ordered as they appear in the data.

Returns
-------
Categorical
A single array, categories will be ordered as they
appear in the list
result : Categorical

Raises
------
Expand All @@ -244,41 +245,47 @@ def union_categoricals(to_union):

first = to_union[0]

if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
for c in to_union):
if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
for other in to_union[1:]):
raise TypeError("dtype of categories must be the same")

ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
return Categorical(np.concatenate([c.codes for c in to_union]),
categories=first.categories, ordered=first.ordered,
fastpath=True)
# identical categories - fastpath
categories = first.categories
ordered = first.ordered
new_codes = np.concatenate([c.codes for c in to_union])

if sort_categories:
categories = categories.sort_values()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the sort can be skipped if the categories are already monotonically increasing.

indexer = first.categories.get_indexer(categories)
new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif all(not c.ordered for c in to_union):
# not ordered
pass
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
categories = Index(cats.unique())
if sort_categories:
categories = categories.sort_values()

new_codes = []
for c in to_union:
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
else:
# must be all NaN
new_codes.append(c.codes)
new_codes = np.concatenate(new_codes)
else:
# to show a proper error message
# ordered - to show a proper error message
if all(c.ordered for c in to_union):
msg = ("to union ordered Categoricals, "
"all categories must be the same")
raise TypeError(msg)
else:
raise TypeError('Categorical.ordered must be the same')

cats = first.categories
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
categories = Index(unique_cats)

new_codes = []
for c in to_union:
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
else:
# must be all NaN
new_codes.append(c.codes)

new_codes = np.concatenate(new_codes)
return Categorical(new_codes, categories=categories, ordered=False,
return Categorical(new_codes, categories=categories, ordered=ordered,
fastpath=True)


Expand Down