Skip to content

Commit 3db3365

Browse files
chris-b1jreback
authored andcommitted
ENH: add sort_categories argument to union_categoricals
- needed for #13406, follow-up to #13763 Author: Chris <[email protected]> Author: sinhrks <[email protected]> Closes #13846 from chris-b1/union_categoricals_ordered and squashes the following commits: 3a710f0 [Chris] lint fix ff0bb5e [Chris] add follow-up PRs to whatsnew ecb2ae9 [Chris] more tests; handle sorth with ordered eea1777 [Chris] skip r-esort when possible on fastpath c559662 [sinhrks] ENH: add sort_categories argument to union_categoricals
1 parent caf69d5 commit 3db3365

File tree

4 files changed

+150
-30
lines changed

4 files changed

+150
-30
lines changed

doc/source/categorical.rst

+9-1
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ Unioning
656656
.. versionadded:: 0.19.0
657657

658658
If you want to combine categoricals that do not necessarily have
659-
the same categories, the `union_categorical` function will
659+
the same categories, the ``union_categoricals`` function will
660660
combine a list-like of categoricals. The new categories
661661
will be the union of the categories being combined.
662662

@@ -667,6 +667,14 @@ will be the union of the categories being combined.
667667
b = pd.Categorical(["a", "b"])
668668
union_categoricals([a, b])
669669
670+
By default, the resulting categories will be ordered as
671+
they appear in the data. If you want the categories to
672+
be lexsorted, use the ``sort_categories=True`` argument.
673+
674+
.. ipython:: python
675+
676+
union_categoricals([a, b], sort_categories=True)
677+
670678
.. note::
671679

672680
In addition to the "easy" case of combining two categoricals of the same

doc/source/whatsnew/v0.19.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ Other enhancements
336336
- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
337337
- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
338338
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
339-
- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
339+
- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`, :issue:`13763`, :issue:`13846`)
340340
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
341341
- ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`).
342342
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)

pandas/tools/tests/test_concat.py

+100
Original file line numberDiff line numberDiff line change
@@ -989,6 +989,106 @@ def test_union_categoricals_ordered(self):
989989
with tm.assertRaisesRegexp(TypeError, msg):
990990
union_categoricals([c1, c2])
991991

992+
def test_union_categoricals_sort(self):
993+
# GH 13846
994+
c1 = Categorical(['x', 'y', 'z'])
995+
c2 = Categorical(['a', 'b', 'c'])
996+
result = union_categoricals([c1, c2], sort_categories=True)
997+
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
998+
categories=['a', 'b', 'c', 'x', 'y', 'z'])
999+
tm.assert_categorical_equal(result, expected)
1000+
1001+
# fastpath
1002+
c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
1003+
c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
1004+
result = union_categoricals([c1, c2], sort_categories=True)
1005+
expected = Categorical(['a', 'b', 'b', 'c'],
1006+
categories=['a', 'b', 'c'])
1007+
tm.assert_categorical_equal(result, expected)
1008+
1009+
# fastpath - skip resort
1010+
c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
1011+
c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
1012+
result = union_categoricals([c1, c2], sort_categories=True)
1013+
expected = Categorical(['a', 'b', 'b', 'c'],
1014+
categories=['a', 'b', 'c'])
1015+
tm.assert_categorical_equal(result, expected)
1016+
1017+
c1 = Categorical(['x', np.nan])
1018+
c2 = Categorical([np.nan, 'b'])
1019+
result = union_categoricals([c1, c2], sort_categories=True)
1020+
expected = Categorical(['x', np.nan, np.nan, 'b'],
1021+
categories=['b', 'x'])
1022+
tm.assert_categorical_equal(result, expected)
1023+
1024+
c1 = Categorical([np.nan])
1025+
c2 = Categorical([np.nan])
1026+
result = union_categoricals([c1, c2], sort_categories=True)
1027+
expected = Categorical([np.nan, np.nan], categories=[])
1028+
tm.assert_categorical_equal(result, expected)
1029+
1030+
c1 = Categorical([])
1031+
c2 = Categorical([])
1032+
result = union_categoricals([c1, c2], sort_categories=True)
1033+
expected = Categorical([])
1034+
tm.assert_categorical_equal(result, expected)
1035+
1036+
c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
1037+
c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
1038+
with tm.assertRaises(TypeError):
1039+
union_categoricals([c1, c2], sort_categories=True)
1040+
1041+
def test_union_categoricals_sort_false(self):
1042+
# GH 13846
1043+
c1 = Categorical(['x', 'y', 'z'])
1044+
c2 = Categorical(['a', 'b', 'c'])
1045+
result = union_categoricals([c1, c2], sort_categories=False)
1046+
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
1047+
categories=['x', 'y', 'z', 'a', 'b', 'c'])
1048+
tm.assert_categorical_equal(result, expected)
1049+
1050+
# fastpath
1051+
c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c'])
1052+
c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c'])
1053+
result = union_categoricals([c1, c2], sort_categories=False)
1054+
expected = Categorical(['a', 'b', 'b', 'c'],
1055+
categories=['b', 'a', 'c'])
1056+
tm.assert_categorical_equal(result, expected)
1057+
1058+
# fastpath - skip resort
1059+
c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
1060+
c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c'])
1061+
result = union_categoricals([c1, c2], sort_categories=False)
1062+
expected = Categorical(['a', 'b', 'b', 'c'],
1063+
categories=['a', 'b', 'c'])
1064+
tm.assert_categorical_equal(result, expected)
1065+
1066+
c1 = Categorical(['x', np.nan])
1067+
c2 = Categorical([np.nan, 'b'])
1068+
result = union_categoricals([c1, c2], sort_categories=False)
1069+
expected = Categorical(['x', np.nan, np.nan, 'b'],
1070+
categories=['x', 'b'])
1071+
tm.assert_categorical_equal(result, expected)
1072+
1073+
c1 = Categorical([np.nan])
1074+
c2 = Categorical([np.nan])
1075+
result = union_categoricals([c1, c2], sort_categories=False)
1076+
expected = Categorical([np.nan, np.nan], categories=[])
1077+
tm.assert_categorical_equal(result, expected)
1078+
1079+
c1 = Categorical([])
1080+
c2 = Categorical([])
1081+
result = union_categoricals([c1, c2], sort_categories=False)
1082+
expected = Categorical([])
1083+
tm.assert_categorical_equal(result, expected)
1084+
1085+
c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True)
1086+
c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True)
1087+
result = union_categoricals([c1, c2], sort_categories=False)
1088+
expected = Categorical(['b', 'a', 'a', 'c'],
1089+
categories=['b', 'a', 'c'], ordered=True)
1090+
tm.assert_categorical_equal(result, expected)
1091+
9921092
def test_concat_bug_1719(self):
9931093
ts1 = tm.makeTimeSeries()
9941094
ts2 = tm.makeTimeSeries()[::2]

pandas/types/concat.py

+40-28
Original file line numberDiff line numberDiff line change
@@ -211,29 +211,31 @@ def convert_categorical(x):
211211
return Categorical(concatted, rawcats)
212212

213213

214-
def union_categoricals(to_union):
214+
def union_categoricals(to_union, sort_categories=False):
215215
"""
216216
Combine list-like of Categoricals, unioning categories. All
217-
must have the same dtype, and none can be ordered.
217+
categories must have the same dtype.
218218
219219
.. versionadded:: 0.19.0
220220
221221
Parameters
222222
----------
223223
to_union : list-like of Categoricals
224+
sort_categories : boolean, default False
225+
If true, resulting categories will be lexsorted, otherwise
226+
they will be ordered as they appear in the data.
224227
225228
Returns
226229
-------
227-
Categorical
228-
A single array, categories will be ordered as they
229-
appear in the list
230+
result : Categorical
230231
231232
Raises
232233
------
233234
TypeError
234235
- all inputs do not have the same dtype
235236
- all inputs do not have the same ordered property
236237
- all inputs are ordered and their categories are not identical
238+
- sort_categories=True and Categoricals are ordered
237239
ValueError
238240
Empty list of categoricals passed
239241
"""
@@ -244,41 +246,51 @@ def union_categoricals(to_union):
244246

245247
first = to_union[0]
246248

247-
if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
248-
for c in to_union):
249+
if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
250+
for other in to_union[1:]):
249251
raise TypeError("dtype of categories must be the same")
250252

253+
ordered = False
251254
if all(first.is_dtype_equal(other) for other in to_union[1:]):
252-
return Categorical(np.concatenate([c.codes for c in to_union]),
253-
categories=first.categories, ordered=first.ordered,
254-
fastpath=True)
255+
# identical categories - fastpath
256+
categories = first.categories
257+
ordered = first.ordered
258+
new_codes = np.concatenate([c.codes for c in to_union])
259+
260+
if sort_categories and ordered:
261+
raise TypeError("Cannot use sort_categories=True with "
262+
"ordered Categoricals")
263+
264+
if sort_categories and not categories.is_monotonic_increasing:
265+
categories = categories.sort_values()
266+
indexer = first.categories.get_indexer(categories)
267+
new_codes = take_1d(indexer, new_codes, fill_value=-1)
255268
elif all(not c.ordered for c in to_union):
256-
# not ordered
257-
pass
269+
# different categories - union and recode
270+
cats = first.categories.append([c.categories for c in to_union[1:]])
271+
categories = Index(cats.unique())
272+
if sort_categories:
273+
categories = categories.sort_values()
274+
275+
new_codes = []
276+
for c in to_union:
277+
if len(c.categories) > 0:
278+
indexer = categories.get_indexer(c.categories)
279+
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
280+
else:
281+
# must be all NaN
282+
new_codes.append(c.codes)
283+
new_codes = np.concatenate(new_codes)
258284
else:
259-
# to show a proper error message
285+
# ordered - to show a proper error message
260286
if all(c.ordered for c in to_union):
261287
msg = ("to union ordered Categoricals, "
262288
"all categories must be the same")
263289
raise TypeError(msg)
264290
else:
265291
raise TypeError('Categorical.ordered must be the same')
266292

267-
cats = first.categories
268-
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
269-
categories = Index(unique_cats)
270-
271-
new_codes = []
272-
for c in to_union:
273-
if len(c.categories) > 0:
274-
indexer = categories.get_indexer(c.categories)
275-
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
276-
else:
277-
# must be all NaN
278-
new_codes.append(c.codes)
279-
280-
new_codes = np.concatenate(new_codes)
281-
return Categorical(new_codes, categories=categories, ordered=False,
293+
return Categorical(new_codes, categories=categories, ordered=ordered,
282294
fastpath=True)
283295

284296

0 commit comments

Comments
 (0)