Skip to content

BUG: union_categorical with Series and cat idx #14199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,40 @@ The below raises ``TypeError`` because the categories are ordered and not identi
Out[3]:
TypeError: to union ordered Categoricals, all categories must be the same

``union_categoricals`` also works with a ``CategoricalIndex`` or a ``Series`` containing
categorical data, but note that the resulting array will always be a plain ``Categorical``.

.. ipython:: python

a = pd.Series(["b", "c"], dtype='category')
b = pd.Series(["a", "b"], dtype='category')
union_categoricals([a, b])

.. note::

``union_categoricals`` may recode the integer codes for categories
when combining categoricals. This is likely what you want,
but if you are relying on the exact numbering of the categories, be
aware.

.. ipython:: python

c1 = pd.Categorical(["b", "c"])
c2 = pd.Categorical(["a", "b"])

c1
# "b" is coded to 0
c1.codes

c2
# "b" is coded to 1
c2.codes

c = union_categoricals([c1, c2])
c
# "b" is coded to 0 throughout, same as c1, different from c2
c.codes

.. _categorical.concat:

Concatenation
Expand Down
31 changes: 26 additions & 5 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas import (DataFrame, concat,
read_csv, isnull, Series, date_range,
Index, Panel, MultiIndex, Timestamp,
DatetimeIndex, Categorical)
DatetimeIndex, Categorical, CategoricalIndex)
from pandas.types.concat import union_categoricals
from pandas.util import testing as tm
from pandas.util.testing import (assert_frame_equal,
Expand Down Expand Up @@ -1539,10 +1539,12 @@ def test_union_categorical(self):
]

for a, b, combined in data:
result = union_categoricals([Categorical(a), Categorical(b)])
expected = Categorical(combined)
tm.assert_categorical_equal(result, expected,
check_category_order=True)
for box in [Categorical, CategoricalIndex, Series]:
result = union_categoricals([box(Categorical(a)),
box(Categorical(b))])
expected = Categorical(combined)
tm.assert_categorical_equal(result, expected,
check_category_order=True)

# new categories ordered by appearance
s = Categorical(['x', 'y', 'z'])
Expand Down Expand Up @@ -1771,6 +1773,25 @@ def test_union_categoricals_sort_false(self):
categories=['b', 'a', 'c'], ordered=True)
tm.assert_categorical_equal(result, expected)

def test_union_categorical_unwrap(self):
    # GH 14173
    # union_categoricals should accept Categorical, Series of category
    # dtype and CategoricalIndex in any mix, always returning a plain
    # Categorical.
    left = Categorical(['a', 'b'])
    right_series = Series(['b', 'c'], dtype='category')
    expected = Categorical(['a', 'b', 'b', 'c'])

    # Categorical + Series
    result = union_categoricals([left, right_series])
    tm.assert_categorical_equal(result, expected)

    # Categorical + CategoricalIndex
    right_index = CategoricalIndex(right_series)
    result = union_categoricals([left, right_index])
    tm.assert_categorical_equal(result, expected)

    # Series + CategoricalIndex
    left_series = Series(left)
    result = union_categoricals([left_series, right_index])
    tm.assert_categorical_equal(result, expected)

    # a plain list is not Categorical-like and must be rejected
    with tm.assertRaises(TypeError):
        union_categoricals([left_series, ['a', 'b', 'c']])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
16 changes: 13 additions & 3 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,15 @@ def _concat_asobject(to_concat):

def union_categoricals(to_union, sort_categories=False):
"""
Combine list-like of Categoricals, unioning categories. All
Combine list-like of Categorical-like, unioning categories. All
categories must have the same dtype.

.. versionadded:: 0.19.0

Parameters
----------
to_union : list-like of Categoricals
to_union : list-like of Categorical, CategoricalIndex,
or Series with dtype='category'
sort_categories : boolean, default False
If true, resulting categories will be lexsorted, otherwise
they will be ordered as they appear in the data.
Expand All @@ -236,11 +237,20 @@ def union_categoricals(to_union, sort_categories=False):
ValueError
Empty list of categoricals passed
"""
from pandas import Index, Categorical
from pandas import Index, Categorical, CategoricalIndex, Series

if len(to_union) == 0:
raise ValueError('No Categoricals to union')

def _maybe_unwrap(x):
    """
    Return the Categorical behind ``x``.

    Parameters
    ----------
    x : Categorical, CategoricalIndex, or Series with dtype='category'

    Returns
    -------
    Categorical
        ``x`` itself when it is already a Categorical, otherwise the
        values extracted from the Series / CategoricalIndex.

    Raises
    ------
    TypeError
        If ``x`` is not Categorical-like, including a Series whose
        dtype is not 'category'.
    """
    if isinstance(x, (CategoricalIndex, Series)):
        x = x.values
    # Validate *after* unwrapping: a non-categorical Series yields a plain
    # ndarray here, which would otherwise only fail later with an
    # AttributeError when ``.categories`` is accessed.
    if not isinstance(x, Categorical):
        raise TypeError("all components to combine must be Categorical")
    return x

to_union = [_maybe_unwrap(x) for x in to_union]
first = to_union[0]

if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
Expand Down