Skip to content

Commit 9cadc4e

Browse files
committed
ENH: union_categorical supports identical categories with ordered
1 parent 474fd05 commit 9cadc4e

File tree

3 files changed

+76
-15
lines changed

3 files changed

+76
-15
lines changed

doc/source/categorical.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -669,9 +669,10 @@ will be the union of the categories being combined.
669669
670670
.. note::
671671

672-
`union_categoricals` only works with unordered categoricals
673-
and will raise if any are ordered.
674-
672+
Apart from the "easy" case of combining categoricals that share the same
673+
categories and ordered flag (i.e. the case ``append`` also handles),
674+
``union_categoricals`` only works with unordered categoricals and will
675+
raise if any are ordered.
675676

676677
Getting Data In/Out
677678
-------------------

pandas/tools/tests/test_concat.py

+53-8
Original file line numberDiff line numberDiff line change
@@ -870,23 +870,26 @@ def test_union_categorical(self):
870870
# new categories ordered by appearance
871871
s = Categorical(['x', 'y', 'z'])
872872
s2 = Categorical(['a', 'b', 'c'])
873-
result = union_categoricals([s, s2]).categories
874-
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
875-
tm.assert_index_equal(result, expected)
873+
result = union_categoricals([s, s2])
874+
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
875+
categories=['x', 'y', 'z', 'a', 'b', 'c'])
876+
tm.assert_categorical_equal(result, expected)
876877

877-
# can't be ordered
878878
s = Categorical([0, 1.2, 2], ordered=True)
879879
s2 = Categorical([0, 1.2, 2], ordered=True)
880-
with tm.assertRaises(TypeError):
881-
union_categoricals([s, s2])
880+
result = union_categoricals([s, s2])
881+
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
882+
tm.assert_categorical_equal(result, expected)
882883

883884
# must exactly match types
884885
s = Categorical([0, 1.2, 2])
885886
s2 = Categorical([2, 3, 4])
886-
with tm.assertRaises(TypeError):
887+
msg = 'dtype of categories must be the same'
888+
with tm.assertRaisesRegexp(TypeError, msg):
887889
union_categoricals([s, s2])
888890

889-
with tm.assertRaises(ValueError):
891+
msg = 'No Categoricals to union'
892+
with tm.assertRaisesRegexp(ValueError, msg):
890893
union_categoricals([])
891894

892895
def test_union_categoricals_nan(self):
@@ -942,6 +945,48 @@ def test_union_categoricals_empty(self):
942945
pd.Categorical([])])
943946
tm.assert_categorical_equal(res, nanc)
944947

948+
def test_union_categorical_same_category(self):
949+
# check fastpath
950+
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
951+
c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
952+
res = union_categoricals([c1, c2])
953+
exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
954+
categories=[1, 2, 3, 4])
955+
tm.assert_categorical_equal(res, exp)
956+
957+
c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
958+
c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
959+
res = union_categoricals([c1, c2])
960+
exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
961+
categories=['x', 'y', 'z'])
962+
tm.assert_categorical_equal(res, exp)
963+
964+
def test_union_categoricals_ordered(self):
965+
c1 = Categorical([1, 2, 3], ordered=True)
966+
c2 = Categorical([1, 2, 3], ordered=False)
967+
968+
msg = 'Categorical.ordered must be the same'
969+
with tm.assertRaisesRegexp(TypeError, msg):
970+
union_categoricals([c1, c2])
971+
972+
res = union_categoricals([c1, c1])
973+
exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
974+
tm.assert_categorical_equal(res, exp)
975+
976+
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
977+
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
978+
979+
res = union_categoricals([c1, c2])
980+
exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
981+
tm.assert_categorical_equal(res, exp)
982+
983+
c1 = Categorical([1, 2, 3], ordered=True)
984+
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
985+
986+
msg = "to union ordered Categoricals, all categories must be the same"
987+
with tm.assertRaisesRegexp(TypeError, msg):
988+
union_categoricals([c1, c2])
989+
945990
def test_concat_bug_1719(self):
946991
ts1 = tm.makeTimeSeries()
947992
ts2 = tm.makeTimeSeries()[::2]

pandas/types/concat.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,9 @@ def union_categoricals(to_union):
231231
Raises
232232
------
233233
TypeError
234-
If any of the categoricals are ordered or all do not
235-
have the same dtype
234+
- the inputs do not all have the same dtype
235+
- the inputs do not all have the same ordered property
236+
- all inputs are ordered and their categories are not identical
236237
ValueError
237238
Empty list of categoricals passed
238239
"""
@@ -242,13 +243,27 @@ def union_categoricals(to_union):
242243
raise ValueError('No Categoricals to union')
243244

244245
first = to_union[0]
245-
if any(c.ordered for c in to_union):
246-
raise TypeError("Can only combine unordered Categoricals")
247246

248247
if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
249248
for c in to_union):
250249
raise TypeError("dtype of categories must be the same")
251250

251+
if all(first.is_dtype_equal(other) for other in to_union[1:]):
252+
return Categorical(np.concatenate([c.codes for c in to_union]),
253+
categories=first.categories, ordered=first.ordered,
254+
fastpath=True)
255+
elif all(not c.ordered for c in to_union):
256+
# not ordered
257+
pass
258+
else:
259+
# to show a proper error message
260+
if all(c.ordered for c in to_union):
261+
msg = ("to union ordered Categoricals, "
262+
"all categories must be the same")
263+
raise TypeError(msg)
264+
else:
265+
raise TypeError('Categorical.ordered must be the same')
266+
252267
cats = first.categories
253268
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
254269
categories = Index(unique_cats)

0 commit comments

Comments
 (0)