Skip to content

Commit d5bea25

Browse files
chris-b1jreback
authored andcommitted
API/ENH: union Categorical
Author: Chris <[email protected]> Closes #13361 from chris-b1/union-categorical and squashes the following commits: 568784f [Chris] versionadded; empty case 17209f9 [Chris] Doc updates; use Index.append 4499cda [Chris] move tests, adress feedback 77e7963 [Chris] doc notes 7b37c34 [Chris] cleanup impl, add asv ccaeb76 [Chris] API/ENH: union Categorical
1 parent 5407249 commit d5bea25

File tree

6 files changed

+173
-7
lines changed

6 files changed

+173
-7
lines changed

asv_bench/benchmarks/categoricals.py

+15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
from .pandas_vb_common import *
2+
try:
3+
from pandas.types.concat import union_categoricals
4+
except ImportError:
5+
pass
26
import string
37

48

@@ -12,6 +16,17 @@ def time_concat_categorical(self):
1216
concat([self.s, self.s])
1317

1418

19+
class union_categorical(object):
    """asv benchmark timing ``union_categoricals`` on two Categoricals."""

    goal_time = 0.2

    def setup(self):
        # Build two six-million-element categoricals whose category sets
        # only partially overlap ('b', 'c', 'd' are shared).
        repeats = 1000000
        self.a = pd.Categorical(list('aabbcd') * repeats)
        self.b = pd.Categorical(list('bbcdjk') * repeats)

    def time_union_categorical(self):
        union_categoricals([self.a, self.b])
28+
29+
1530
class categorical_value_counts(object):
1631
goal_time = 1
1732

doc/source/categorical.rst

+25
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,31 @@ In this case the categories are not the same and so an error is raised:
648648
649649
The same applies to ``df.append(df_different)``.
650650

651+
.. _categorical.union:
652+
653+
Unioning
654+
~~~~~~~~
655+
656+
.. versionadded:: 0.18.2
657+
658+
If you want to combine categoricals that do not necessarily have
659+
the same categories, the `union_categorical` function will
660+
combine a list-like of categoricals. The new categories
661+
will be the union of the categories being combined.
662+
663+
.. ipython:: python
664+
665+
from pandas.types.concat import union_categoricals
666+
a = pd.Categorical(["b", "c"])
667+
b = pd.Categorical(["a", "b"])
668+
union_categoricals([a, b])
669+
670+
.. note::
671+
672+
`union_categoricals` only works with unordered categoricals
673+
and will raise if any are ordered.
674+
675+
651676
Getting Data In/Out
652677
-------------------
653678

doc/source/whatsnew/v0.18.2.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ Other enhancements
9191

9292
- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`)
9393
- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
94-
94+
- A ``union_categoricals`` function has been added for combining categoricals, see :ref:`Unioning Categoricals<categorical.union>` (:issue:`13361`)
9595
- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`)
9696
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
9797

pandas/tools/tests/test_concat.py

+50-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
from pandas import (DataFrame, concat,
1010
read_csv, isnull, Series, date_range,
1111
Index, Panel, MultiIndex, Timestamp,
12-
DatetimeIndex)
12+
DatetimeIndex, Categorical)
13+
from pandas.types.concat import union_categoricals
1314
from pandas.util import testing as tm
1415
from pandas.util.testing import (assert_frame_equal,
1516
makeCustomDataframe as mkdf,
@@ -919,6 +920,54 @@ def test_concat_keys_with_none(self):
919920
keys=['b', 'c', 'd', 'e'])
920921
tm.assert_frame_equal(result, expected)
921922

923+
def test_union_categorical(self):
    # GH 13361
    # Each case is (left values, right values, expected combined values).
    cases = [
        (list('abc'), list('abd'), list('abcabd')),
        ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
        ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),

        (pd.date_range('2014-01-01', '2014-01-05'),
         pd.date_range('2014-01-06', '2014-01-07'),
         pd.date_range('2014-01-01', '2014-01-07')),

        (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'),
         pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'),
         pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')),

        (pd.period_range('2014-01-01', '2014-01-05'),
         pd.period_range('2014-01-06', '2014-01-07'),
         pd.period_range('2014-01-01', '2014-01-07')),
    ]

    for left, right, combined in cases:
        pieces = [Categorical(left), Categorical(right)]
        result = union_categoricals(pieces)
        expected = Categorical(combined)
        tm.assert_categorical_equal(result, expected,
                                    check_category_order=True)

    # new categories ordered by appearance
    head = Categorical(['x', 'y', 'z'])
    tail = Categorical(['a', 'b', 'c'])
    result = union_categoricals([head, tail]).categories
    expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
    tm.assert_index_equal(result, expected)

    # can't be ordered
    ordered_a = Categorical([0, 1.2, 2], ordered=True)
    ordered_b = Categorical([0, 1.2, 2], ordered=True)
    with tm.assertRaises(TypeError):
        union_categoricals([ordered_a, ordered_b])

    # must exactly match types
    floats = Categorical([0, 1.2, 2])
    ints = Categorical([2, 3, 4])
    with tm.assertRaises(TypeError):
        union_categoricals([floats, ints])

    # an empty list of categoricals is rejected
    with tm.assertRaises(ValueError):
        union_categoricals([])
970+
922971
def test_concat_bug_1719(self):
923972
ts1 = tm.makeTimeSeries()
924973
ts2 = tm.makeTimeSeries()[::2]

pandas/types/concat.py

+51
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,57 @@ def convert_categorical(x):
201201
return Categorical(concatted, rawcats)
202202

203203

204+
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.18.2

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
        A single array, categories will be ordered as they
        appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    # ``len`` rather than truthiness: ``to_union`` may be an array-like
    # whose truth value is ambiguous.
    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]
    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    if not all(com.is_dtype_equal(c.categories.dtype, first.categories.dtype)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    cats = first.categories
    unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
    categories = Index(unique_cats)

    new_codes = []
    for c in to_union:
        old_codes = np.asarray(c.codes)
        if len(c.categories) == 0:
            # No categories means every code is the missing-value
            # sentinel (-1); there is nothing to remap, and ``take`` on
            # an empty indexer would raise.
            new_codes.append(old_codes)
            continue

        # indexer[i] is the position of this input's i-th category in
        # the unioned categories.
        indexer = categories.get_indexer(c.categories)
        remapped = indexer.take(old_codes)
        # ``take`` treats -1 as "last element", which would turn missing
        # values into the last category; restore the sentinel instead.
        remapped[old_codes == -1] = -1
        new_codes.append(remapped)

    codes = np.concatenate(new_codes)
    return Categorical(codes, categories=categories, ordered=False,
                       fastpath=True)
253+
254+
204255
def _concat_datetime(to_concat, axis=0, typs=None):
205256
"""
206257
provide concatenation of an datetimelike array of arrays each of which is a

pandas/util/testing.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -963,14 +963,40 @@ def assertNotIsInstance(obj, cls, msg=''):
963963

964964

965965
def assert_categorical_equal(left, right, check_dtype=True,
966-
obj='Categorical'):
966+
obj='Categorical', check_category_order=True):
967+
"""Test that categoricals are eqivalent
968+
969+
Parameters
970+
----------
971+
left, right : Categorical
972+
Categoricals to compare
973+
check_dtype : bool, default True
974+
Check that integer dtype of the codes are the same
975+
obj : str, default 'Categorical'
976+
Specify object name being compared, internally used to show appropriate
977+
assertion message
978+
check_category_order : bool, default True
979+
Whether the order of the categories should be compared, which
980+
implies identical integer codes. If False, only the resulting
981+
values are compared. The ordered attribute is
982+
checked regardless.
983+
"""
967984
assertIsInstance(left, pd.Categorical, '[Categorical] ')
968985
assertIsInstance(right, pd.Categorical, '[Categorical] ')
969986

970-
assert_index_equal(left.categories, right.categories,
971-
obj='{0}.categories'.format(obj))
972-
assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
973-
obj='{0}.codes'.format(obj))
987+
if check_category_order:
988+
assert_index_equal(left.categories, right.categories,
989+
obj='{0}.categories'.format(obj))
990+
assert_numpy_array_equal(left.codes, right.codes,
991+
check_dtype=check_dtype,
992+
obj='{0}.codes'.format(obj))
993+
else:
994+
assert_index_equal(left.categories.sort_values(),
995+
right.categories.sort_values(),
996+
obj='{0}.categories'.format(obj))
997+
assert_index_equal(left.categories.take(left.codes),
998+
right.categories.take(right.codes),
999+
obj='{0}.values'.format(obj))
9741000

9751001
assert_attr_equal('ordered', left, right, obj=obj)
9761002

0 commit comments

Comments
 (0)