Skip to content

Commit 59f2557

Browse files
sinhrks authored and jreback committed
ENH: union_categorical supports identical categories with ordered
xref pandas-dev#13410, pandas-dev#13524 Author: sinhrks <[email protected]> Closes pandas-dev#13763 from sinhrks/union_categoricals_ordered and squashes the following commits: 9cadc4e [sinhrks] ENH: union_categorical supports identical categories with ordered
1 parent 54b2777 commit 59f2557

File tree

3 files changed

+76
-15
lines changed

3 files changed

+76
-15
lines changed

doc/source/categorical.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -669,9 +669,10 @@ will be the union of the categories being combined.
669669
670670
.. note::
671671

672-
`union_categoricals` only works with unordered categoricals
673-
and will raise if any are ordered.
674-
672+
Apart from the "easy" case of combining two categoricals with the same
673+
categories and order information (e.g. what you could also ``append``),
674+
``union_categoricals`` only works with unordered categoricals and will
675+
raise if any are ordered.
675676

676677
Getting Data In/Out
677678
-------------------

pandas/tools/tests/test_concat.py

+53-8
Original file line numberDiff line numberDiff line change
@@ -872,23 +872,26 @@ def test_union_categorical(self):
872872
# new categories ordered by appearance
873873
s = Categorical(['x', 'y', 'z'])
874874
s2 = Categorical(['a', 'b', 'c'])
875-
result = union_categoricals([s, s2]).categories
876-
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
877-
tm.assert_index_equal(result, expected)
875+
result = union_categoricals([s, s2])
876+
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
877+
categories=['x', 'y', 'z', 'a', 'b', 'c'])
878+
tm.assert_categorical_equal(result, expected)
878879

879-
# can't be ordered
880880
s = Categorical([0, 1.2, 2], ordered=True)
881881
s2 = Categorical([0, 1.2, 2], ordered=True)
882-
with tm.assertRaises(TypeError):
883-
union_categoricals([s, s2])
882+
result = union_categoricals([s, s2])
883+
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
884+
tm.assert_categorical_equal(result, expected)
884885

885886
# must exactly match types
886887
s = Categorical([0, 1.2, 2])
887888
s2 = Categorical([2, 3, 4])
888-
with tm.assertRaises(TypeError):
889+
msg = 'dtype of categories must be the same'
890+
with tm.assertRaisesRegexp(TypeError, msg):
889891
union_categoricals([s, s2])
890892

891-
with tm.assertRaises(ValueError):
893+
msg = 'No Categoricals to union'
894+
with tm.assertRaisesRegexp(ValueError, msg):
892895
union_categoricals([])
893896

894897
def test_union_categoricals_nan(self):
@@ -944,6 +947,48 @@ def test_union_categoricals_empty(self):
944947
pd.Categorical([])])
945948
tm.assert_categorical_equal(res, nanc)
946949

950+
def test_union_categorical_same_category(self):
951+
# check fastpath
952+
c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
953+
c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
954+
res = union_categoricals([c1, c2])
955+
exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
956+
categories=[1, 2, 3, 4])
957+
tm.assert_categorical_equal(res, exp)
958+
959+
c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z'])
960+
c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z'])
961+
res = union_categoricals([c1, c2])
962+
exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
963+
categories=['x', 'y', 'z'])
964+
tm.assert_categorical_equal(res, exp)
965+
966+
def test_union_categoricals_ordered(self):
967+
c1 = Categorical([1, 2, 3], ordered=True)
968+
c2 = Categorical([1, 2, 3], ordered=False)
969+
970+
msg = 'Categorical.ordered must be the same'
971+
with tm.assertRaisesRegexp(TypeError, msg):
972+
union_categoricals([c1, c2])
973+
974+
res = union_categoricals([c1, c1])
975+
exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
976+
tm.assert_categorical_equal(res, exp)
977+
978+
c1 = Categorical([1, 2, 3, np.nan], ordered=True)
979+
c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
980+
981+
res = union_categoricals([c1, c2])
982+
exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
983+
tm.assert_categorical_equal(res, exp)
984+
985+
c1 = Categorical([1, 2, 3], ordered=True)
986+
c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
987+
988+
msg = "to union ordered Categoricals, all categories must be the same"
989+
with tm.assertRaisesRegexp(TypeError, msg):
990+
union_categoricals([c1, c2])
991+
947992
def test_concat_bug_1719(self):
948993
ts1 = tm.makeTimeSeries()
949994
ts2 = tm.makeTimeSeries()[::2]

pandas/types/concat.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,9 @@ def union_categoricals(to_union):
231231
Raises
232232
------
233233
TypeError
234-
If any of the categoricals are ordered or all do not
235-
have the same dtype
234+
- all inputs do not have the same dtype
235+
- all inputs do not have the same ordered property
236+
- all inputs are ordered and their categories are not identical
236237
ValueError
237238
Empty list of categoricals passed
238239
"""
@@ -242,13 +243,27 @@ def union_categoricals(to_union):
242243
raise ValueError('No Categoricals to union')
243244

244245
first = to_union[0]
245-
if any(c.ordered for c in to_union):
246-
raise TypeError("Can only combine unordered Categoricals")
247246

248247
if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
249248
for c in to_union):
250249
raise TypeError("dtype of categories must be the same")
251250

251+
if all(first.is_dtype_equal(other) for other in to_union[1:]):
252+
return Categorical(np.concatenate([c.codes for c in to_union]),
253+
categories=first.categories, ordered=first.ordered,
254+
fastpath=True)
255+
elif all(not c.ordered for c in to_union):
256+
# not ordered
257+
pass
258+
else:
259+
# to show a proper error message
260+
if all(c.ordered for c in to_union):
261+
msg = ("to union ordered Categoricals, "
262+
"all categories must be the same")
263+
raise TypeError(msg)
264+
else:
265+
raise TypeError('Categorical.ordered must be the same')
266+
252267
cats = first.categories
253268
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
254269
categories = Index(unique_cats)

0 commit comments

Comments
 (0)