@@ -211,22 +211,23 @@ def convert_categorical(x):
211
211
return Categorical (concatted , rawcats )
212
212
213
213
214
def union_categoricals(to_union, sort_categories=False):
    """
    Combine list-like of Categoricals, unioning categories. All
    categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals
    sort_categories : boolean, default False
        If True, the resulting categories will be sorted (via
        ``Index.sort_values``); otherwise they will be ordered as they
        appear in the data.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - the categories do not all have the same dtype
        - the inputs do not all have the same ``ordered`` property
        - all inputs are ordered and their categories are not identical
        - ``sort_categories=True`` and the Categoricals are ordered
    ValueError
        An empty list of Categoricals was passed
    """
    # NOTE(review): imported at call time so this block is self-contained;
    # confirm against the module's existing import arrangement.
    from pandas import Index, Categorical

    if len(to_union) == 0:
        # fail with a clear message instead of an IndexError on to_union[0]
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    rest = to_union[1:]

    if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
               for other in rest):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in rest):
        # identical categories - fastpath: codes can be concatenated as-is
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in to_union])

        if sort_categories and ordered:
            # re-sorting would silently change the meaning of the ordering
            raise TypeError("Cannot use sort_categories=True with "
                            "ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # indexer[old_code] is the position of that category in the
            # sorted categories, i.e. the new code for each old code
            indexer = categories.get_indexer(first.categories)
            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in rest])
        categories = Index(cats.unique())
        if sort_categories:
            categories = categories.sort_values()

        new_codes = []
        for c in to_union:
            if len(c.categories) > 0:
                # map this input's codes onto the unioned categories
                indexer = categories.get_indexer(c.categories)
                new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
            else:
                # must be all NaN, so the codes need no recoding
                new_codes.append(c.codes)
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
            raise TypeError(msg)
        else:
            raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)
283
290
284
291
0 commit comments