Skip to content

Commit ccaeb76

Browse files
committed
API/ENH: union Categorical
1 parent 103f7d3 commit ccaeb76

File tree

4 files changed

+274
-3
lines changed

4 files changed

+274
-3
lines changed

pandas/core/algorithms.py

+34
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,34 @@ def select_n(series, n, keep, method):
573573
return dropped.iloc[inds]
574574

575575

576+
def union_categoricals(to_concat):
    """
    Combine list-like of Categoricals, unioning categories.

    All inputs must have categories of the same dtype, and none may be
    ordered. Makes no guarantee about the ordering of the new categories.

    Parameters
    ----------
    to_concat : list-like of Categorical

    Returns
    -------
    Categorical
        Built via ``Categorical.from_codes`` from the concatenated,
        recoded codes and the unioned categories.

    Raises
    ------
    ValueError
        If ``to_concat`` is empty.
    TypeError
        If any input is ordered, or if the category dtypes differ.
    """
    # materialize once: the input is traversed several times below, which
    # would silently exhaust a one-shot iterable
    to_concat = list(to_concat)
    if not to_concat:
        raise ValueError("No Categoricals to union")

    from pandas.core.categorical import Categorical

    if any(c.ordered for c in to_concat):
        raise TypeError("Can only combine unordered Categoricals")

    first = to_concat[0]
    if not all(com.is_dtype_equal(c.categories, first.categories)
               for c in to_concat):
        raise TypeError("dtype of categories must be the same")

    new_size = sum(len(c.codes) for c in to_concat)
    # the recode buffer is indexed by category position, so it has to be
    # as long as the largest category set (not the largest codes array)
    recode_size = max(len(c.categories) for c in to_concat)
    codes = [com._ensure_int64(c.codes) for c in to_concat]

    # _get_data_algo yields (routine, values); the routine is chosen from
    # the first input, the values are collected for every input
    f, _ = _get_data_algo(first.categories, _categorical_combiner)
    categories = [_get_data_algo(c.categories, _categorical_combiner)[1]
                  for c in to_concat]
    new_codes, new_categories = f(codes, categories, new_size, recode_size)
    return Categorical.from_codes(new_codes, new_categories)
602+
603+
576604
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
577605
ns, = np.nonzero(arr <= kth_val)
578606
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
@@ -612,6 +640,12 @@ def _hashtable_algo(f, dtype, return_dtype=None):
612640
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
613641
}
614642

643+
# dtype-keyed dispatch table passed to _get_data_algo by
# union_categoricals to pick the matching htable recategorize routine
_categorical_combiner = {
    'float64': htable.recategorize_float64,
    'int64': htable.recategorize_int64,
    'generic': htable.recategorize_object,
}
648+
615649

616650
def _get_data_algo(values, func_map):
617651
if com.is_float_dtype(values):

pandas/hashtable.pyx

+200
Original file line numberDiff line numberDiff line change
@@ -1114,6 +1114,206 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
11141114
kh_destroy_int64(table)
11151115
return out
11161116

1117+
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_int64(list codes, list cats, int N, int recode_size):
    """
    Union several int64 category sets and recode their codes against the
    combined set.

    Parameters
    ----------
    codes : list of int64 arrays
        Codes of each input; -1 entries are copied through unchanged.
    cats : list of int64 arrays
        Categories of each input, aligned with ``codes``. The values within
        one array are assumed to be unique.
    N : int
        Length of the output codes array; must be at least the total
        number of codes.
    recode_size : int
        Initial length of the scratch recode buffer. The buffer is
        enlarged when an input has more categories than this.

    Returns
    -------
    (new_codes, new_categories) : tuple of ndarray
        ``new_categories`` is in order of first appearance, so that
        ``new_categories[new_codes[i]]`` is the value of element ``i``.

    Raises
    ------
    ValueError
        If ``N`` is smaller than the total number of codes.
    """
    cdef:
        kh_int64_t *table = kh_init_int64()
        int64_t[:] new_codes, recode
        int64_t[:] current_codes
        int64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # the hash table is C-allocated: release it on every exit path
    try:
        new_codes = np.empty(N, dtype='int64')
        recode = np.empty(recode_size, dtype='int64')

        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]
            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is disabled, so guard the writes explicitly
            if i + n_codes > new_codes.shape[0]:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if n_cats > recode.shape[0]:
                recode = np.empty(n_cats, dtype='int64')

            with nogil:
                if cat_id == 0:
                    kh_resize_int64(table, n_cats)
                    # first pass dump directly in to table since
                    # uniqueness is guaranteed
                    for j in range(n_cats):
                        k = kh_put_int64(table, current_categories[j], &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    # reuse codes
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_int64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_int64(table, current_categories[j],
                                             &ret)
                            table.vals[k] = current_code
                            current_code += 1
                        # add to the recode table, mapping from
                        # orig category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        # continue filling new codes, this pass
                        # looking up in recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from hash table; each key is written at
        # the position of its master code, because bucket order is
        # unrelated to the order in which codes were handed out
        new_categories = np.zeros(table.n_occupied, dtype='int64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_int64(table, k):
                    new_categories[<Py_ssize_t>table.vals[k]] = table.keys[k]
        return np.asarray(new_codes), np.asarray(new_categories)
    finally:
        kh_destroy_int64(table)
1182+
1183+
# this could be fused with the int version
# but no great way to work with hash table
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_float64(list codes, list cats, int N, int recode_size):
    """
    Union several float64 category sets and recode their codes against the
    combined set.

    Parameters
    ----------
    codes : list of int64 arrays
        Codes of each input; -1 entries are copied through unchanged.
    cats : list of float64 arrays
        Categories of each input, aligned with ``codes``. The values within
        one array are assumed to be unique.
    N : int
        Length of the output codes array; must be at least the total
        number of codes.
    recode_size : int
        Initial length of the scratch recode buffer. The buffer is
        enlarged when an input has more categories than this.

    Returns
    -------
    (new_codes, new_categories) : tuple of ndarray
        ``new_categories`` is in order of first appearance, so that
        ``new_categories[new_codes[i]]`` is the value of element ``i``.

    Raises
    ------
    ValueError
        If ``N`` is smaller than the total number of codes.
    """
    cdef:
        kh_float64_t *table = kh_init_float64()
        int64_t[:] new_codes, recode
        int64_t[:] current_codes
        float64_t[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # the hash table is C-allocated: release it on every exit path
    try:
        new_codes = np.empty(N, dtype='int64')
        recode = np.empty(recode_size, dtype='int64')

        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]
            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is disabled, so guard the writes explicitly
            if i + n_codes > new_codes.shape[0]:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if n_cats > recode.shape[0]:
                recode = np.empty(n_cats, dtype='int64')

            with nogil:
                if cat_id == 0:
                    # first pass dump directly in, since uniqueness is
                    # guaranteed and don't need to recode
                    kh_resize_float64(table, n_cats)
                    for j in range(n_cats):
                        k = kh_put_float64(table, current_categories[j],
                                           &ret)
                        table.vals[k] = current_code
                        current_code += 1
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
                else:
                    for j in range(n_cats):
                        k = kh_get_float64(table, current_categories[j])

                        # if a new category, add to the master hash table
                        if k == table.n_buckets:
                            k = kh_put_float64(table, current_categories[j],
                                               &ret)
                            table.vals[k] = current_code
                            current_code += 1

                        # add to the recode table, mapping from
                        # orig category -> master category
                        recode[j] = table.vals[k]

                    for j in range(n_codes):
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from hash table; each key is written at
        # the position of its master code, because bucket order is
        # unrelated to the order in which codes were handed out
        new_categories = np.zeros(table.n_occupied, dtype='float64')
        with nogil:
            for k in range(table.n_buckets):
                if kh_exist_float64(table, k):
                    new_categories[<Py_ssize_t>table.vals[k]] = table.keys[k]
        return np.asarray(new_codes), np.asarray(new_categories)
    finally:
        kh_destroy_float64(table)
1248+
1249+
1250+
@cython.wraparound(False)
@cython.boundscheck(False)
def recategorize_object(list codes, list cats, int N, int recode_size):
    """
    Union several object category sets and recode their codes against the
    combined set.

    Parameters
    ----------
    codes : list of int64 arrays
        Codes of each input; -1 entries are copied through unchanged.
    cats : list of object arrays
        Categories of each input, aligned with ``codes``. The values within
        one array are assumed to be unique. The hash table stores the
        object pointers as-is, so the arrays must stay alive for the
        duration of the call (they do, being held by ``cats``).
    N : int
        Length of the output codes array; must be at least the total
        number of codes.
    recode_size : int
        Initial length of the scratch recode buffer. The buffer is
        enlarged when an input has more categories than this.

    Returns
    -------
    (new_codes, new_categories) : tuple of ndarray
        ``new_categories`` is in order of first appearance, so that
        ``new_categories[new_codes[i]]`` is the value of element ``i``.

    Raises
    ------
    ValueError
        If ``N`` is smaller than the total number of codes.
    """
    cdef:
        kh_pymap_t *table = kh_init_pymap()
        int64_t[:] new_codes, recode
        int64_t[:] current_codes
        object[:] new_categories, current_categories
        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
        int ret = 0
        int64_t current_code = 0
        khiter_t k

    # the hash table is C-allocated: release it on every exit path
    try:
        new_codes = np.empty(N, dtype='int64')
        recode = np.empty(recode_size, dtype='int64')

        for cat_id in range(len(codes)):
            current_codes = codes[cat_id]
            current_categories = cats[cat_id]

            n_cats = current_categories.shape[0]
            n_codes = current_codes.shape[0]

            # bounds checking is disabled, so guard the writes explicitly
            if i + n_codes > new_codes.shape[0]:
                raise ValueError("N is smaller than the total number "
                                 "of codes")
            if n_cats > recode.shape[0]:
                recode = np.empty(n_cats, dtype='int64')

            if cat_id == 0:
                kh_resize_pymap(table, n_cats)
                # first pass dump directly in to table since uniqueness
                # is guaranteed and don't need to recode
                for j in range(n_cats):
                    k = kh_put_pymap(table,
                                     <PyObject *>current_categories[j], &ret)
                    table.vals[k] = current_code
                    current_code += 1
                with nogil:
                    # reuse codes
                    for j in range(n_codes):
                        new_codes[i] = current_codes[j]
                        i += 1
            else:
                for j in range(n_cats):
                    k = kh_get_pymap(table, <PyObject*>current_categories[j])

                    # if a new category, add to the master hash table
                    if k == table.n_buckets:
                        k = kh_put_pymap(table,
                                         <PyObject*>current_categories[j],
                                         &ret)
                        table.vals[k] = current_code
                        current_code += 1

                    # add to the recode table, mapping from
                    # orig category -> master category
                    recode[j] = table.vals[k]

                with nogil:
                    for j in range(n_codes):
                        # continue filling new codes, this pass
                        # looking up in recode table
                        if current_codes[j] == -1:
                            new_codes[i] = -1
                        else:
                            new_codes[i] = recode[current_codes[j]]
                        i += 1

        # fill in new categories from hash table; each key is written at
        # the position of its master code, because bucket order is
        # unrelated to the order in which codes were handed out
        new_categories = np.zeros(table.n_occupied, dtype='object')
        for k in range(table.n_buckets):
            if kh_exist_pymap(table, k):
                new_categories[<Py_ssize_t>table.vals[k]] = \
                    <object>table.keys[k]
        return np.asarray(new_codes), np.asarray(new_categories)
    finally:
        kh_destroy_pymap(table)
1316+
11171317

11181318
@cython.wraparound(False)
11191319
@cython.boundscheck(False)

pandas/tests/test_categorical.py

+32
Original file line numberDiff line numberDiff line change
@@ -3943,6 +3943,38 @@ def f():
39433943
'category', categories=list('cab'))})
39443944
tm.assert_frame_equal(result, expected)
39453945

3946+
def test_union(self):
    # union_categoricals combines unordered Categoricals whose categories
    # share a dtype; the order of the resulting categories is unspecified,
    # hence ignore_order=True in the comparisons
    from pandas.core.algorithms import union_categoricals

    # (left values, right values, values of the expected union)
    cases = [
        (list('abc'), list('abd'), list('abcabd')),
        ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
        ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
    ]
    for left, right, combined in cases:
        result = union_categoricals([Categorical(left), Categorical(right)])
        expected = Categorical(combined)
        tm.assert_categorical_equal(result, expected, ignore_order=True)

    # can't be ordered
    ordered = Categorical([0, 1.2, 2], ordered=True)
    unordered = Categorical([2, 3.4, 4])
    with tm.assertRaises(TypeError):
        union_categoricals([ordered, unordered])

    # must exactly match types
    floats = Categorical([0, 1.2, 2])
    ints = Categorical([2, 3, 4])
    with tm.assertRaises(TypeError):
        union_categoricals([floats, ints])
3977+
39463978
def test_categorical_index_preserver(self):
39473979

39483980
a = Series(np.arange(6, dtype='int64'))

pandas/util/testing.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''):
963963

964964

965965
def assert_categorical_equal(left, right, check_dtype=True,
966-
obj='Categorical'):
966+
obj='Categorical', ignore_order=False):
967967
assertIsInstance(left, pd.Categorical, '[Categorical] ')
968968
assertIsInstance(right, pd.Categorical, '[Categorical] ')
969969

970-
assert_index_equal(left.categories, right.categories,
971-
obj='{0}.categories'.format(obj))
970+
if ignore_order:
971+
assert_index_equal(left.categories.sort_values(),
972+
right.categories.sort_values(),
973+
obj='{0}.categories'.format(obj))
974+
else:
975+
assert_index_equal(left.categories, right.categories,
976+
obj='{0}.categories'.format(obj))
972977
assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
973978
obj='{0}.codes'.format(obj))
974979

0 commit comments

Comments
 (0)