Skip to content

Commit 7b37c34

Browse files
committed
cleanup impl, add asv
1 parent ccaeb76 commit 7b37c34

File tree

5 files changed

+56
-235
lines changed

5 files changed

+56
-235
lines changed

asv_bench/benchmarks/categoricals.py

+15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
from .pandas_vb_common import *
2+
try:
3+
from pandas.types.concat import union_categoricals
4+
except ImportError:
5+
pass
26
import string
37

48

@@ -12,6 +16,17 @@ def time_concat_categorical(self):
1216
concat([self.s, self.s])
1317

1418

19+
class union_categorical(object):
    # ASV benchmark: time unioning two large, partially-overlapping
    # Categoricals via pandas.types.concat.union_categoricals.
    goal_time = 0.2

    def setup(self):
        # Two 6M-element categoricals sharing categories 'b', 'c', 'd'.
        self.a = pd.Categorical(list("aabbcd") * 1000000)
        self.b = pd.Categorical(list("bbcdjk") * 1000000)

    def time_union_categorical(self):
        union_categoricals([self.a, self.b])
28+
29+
1530
class categorical_value_counts(object):
1631
goal_time = 1
1732

pandas/core/algorithms.py

-34
Original file line numberDiff line numberDiff line change
@@ -573,34 +573,6 @@ def select_n(series, n, keep, method):
573573
return dropped.iloc[inds]
574574

575575

576-
def union_categoricals(to_concat):
577-
"""
578-
Combine list-like of Categoricals, unioning categories. All
579-
must have the same dtype, and none can be ordered.
580-
581-
Makes no guarantee about the ordering of the new categories
582-
"""
583-
from pandas.core.categorical import Categorical
584-
585-
if any(c.ordered for c in to_concat):
586-
raise TypeError("Can only combine unordered Categoricals")
587-
588-
first = to_concat[0]
589-
if not all(com.is_dtype_equal(c.categories, first.categories)
590-
for c in to_concat):
591-
raise TypeError("dtype of categories must be the same")
592-
593-
new_size = sum(len(c.codes) for c in to_concat)
594-
recode_size = max(len(c.codes) for c in to_concat)
595-
codes = [com._ensure_int64(c.codes) for c in to_concat]
596-
597-
algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner)
598-
f, _ = algo_getter(first)
599-
categories = [algo_getter(c)[1] for c in to_concat]
600-
new_codes, new_categories = f(codes, categories, new_size, recode_size)
601-
return Categorical.from_codes(new_codes, new_categories)
602-
603-
604576
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
605577
ns, = np.nonzero(arr <= kth_val)
606578
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
@@ -640,12 +612,6 @@ def _hashtable_algo(f, dtype, return_dtype=None):
640612
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
641613
}
642614

643-
_categorical_combiner = {
644-
'float64': htable.recategorize_float64,
645-
'int64': htable.recategorize_int64,
646-
'generic': htable.recategorize_object
647-
}
648-
649615

650616
def _get_data_algo(values, func_map):
651617
if com.is_float_dtype(values):

pandas/hashtable.pyx

-200
Original file line numberDiff line numberDiff line change
@@ -1114,206 +1114,6 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
11141114
kh_destroy_int64(table)
11151115
return out
11161116

1117-
@cython.wraparound(False)
1118-
@cython.boundscheck(False)
1119-
def recategorize_int64(list codes, list cats, int N, int recode_size):
1120-
cdef:
1121-
kh_int64_t *table = kh_init_int64()
1122-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1123-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1124-
int64_t[:] current_codes
1125-
int64_t[:] new_categories, current_categories
1126-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1127-
int ret = 0
1128-
int64_t current_code = 0
1129-
khiter_t k
1130-
1131-
for cat_id in range(len(codes)):
1132-
current_codes = codes[cat_id]
1133-
current_categories = cats[cat_id]
1134-
1135-
with nogil:
1136-
n_cats = current_categories.shape[0]
1137-
n_codes = current_codes.shape[0]
1138-
if cat_id == 0:
1139-
kh_resize_int64(table, n_cats)
1140-
# first pass dump directly in to table since uniqueness
1141-
# is guaranteed
1142-
for j in range(n_cats):
1143-
k = kh_put_int64(table, current_categories[j], &ret)
1144-
table.vals[k] = current_code
1145-
current_code += 1
1146-
# reuse codes
1147-
for j in range(n_codes):
1148-
new_codes[i] = current_codes[j]
1149-
i += 1
1150-
else:
1151-
for j in range(n_cats):
1152-
k = kh_get_int64(table, current_categories[j])
1153-
1154-
# if a new category, add to the master hash table
1155-
if k == table.n_buckets:
1156-
k = kh_put_int64(table, current_categories[j], &ret)
1157-
table.vals[k] = current_code
1158-
current_code += 1
1159-
# add to the recode table, mapping from
1160-
# orig category -> master_category
1161-
recode[j] = table.vals[k]
1162-
1163-
for j in range(n_codes):
1164-
# continue filing new codes, this pass
1165-
# looking up in recode table
1166-
if current_codes[j] == -1:
1167-
new_codes[i] = -1
1168-
else:
1169-
new_codes[i] = recode[current_codes[j]]
1170-
i += 1
1171-
1172-
# fill in new categories from hash table
1173-
i = 0
1174-
new_categories = np.zeros(table.n_occupied, dtype='int64')
1175-
with nogil:
1176-
for k in range(table.n_buckets):
1177-
if kh_exist_int64(table, k):
1178-
new_categories[i] = table.keys[k]
1179-
i += 1
1180-
kh_destroy_int64(table)
1181-
return np.asarray(new_codes), np.asarray(new_categories)
1182-
1183-
# this could be fused with the int version
1184-
# but no great way to work with hash table
1185-
@cython.wraparound(False)
1186-
@cython.boundscheck(False)
1187-
def recategorize_float64(list codes, list cats, int N, int recode_size):
1188-
cdef:
1189-
kh_float64_t *table = kh_init_float64()
1190-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1191-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1192-
int64_t[:] current_codes
1193-
float64_t[:] new_categories, current_categories
1194-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1195-
int ret = 0
1196-
int64_t current_code = 0
1197-
khiter_t k
1198-
1199-
for cat_id in range(len(codes)):
1200-
current_codes = codes[cat_id]
1201-
current_categories = cats[cat_id]
1202-
1203-
with nogil:
1204-
n_cats = current_categories.shape[0]
1205-
n_codes = current_codes.shape[0]
1206-
if cat_id == 0:
1207-
# first pass dump directly in, since uniqueness is guaranteed
1208-
# and don't need to recode
1209-
kh_resize_float64(table, n_cats)
1210-
for j in range(n_cats):
1211-
k = kh_put_float64(table, current_categories[j], &ret)
1212-
table.vals[k] = current_code
1213-
current_code += 1
1214-
for j in range(n_codes):
1215-
new_codes[i] = current_codes[j]
1216-
i += 1
1217-
else:
1218-
for j in range(n_cats):
1219-
k = kh_get_float64(table, current_categories[j])
1220-
1221-
# if a new category, add to the master hash table
1222-
if k == table.n_buckets:
1223-
k = kh_put_float64(table, current_categories[j], &ret)
1224-
table.vals[k] = current_code
1225-
current_code += 1
1226-
1227-
# add to the recode table, mapping from
1228-
# orig_category -> master_category
1229-
recode[j] = table.vals[k]
1230-
1231-
for j in range(n_codes):
1232-
if current_codes[j] == -1:
1233-
new_codes[i] = -1
1234-
else:
1235-
new_codes[i] = recode[current_codes[j]]
1236-
i += 1
1237-
1238-
# fill in new categories from hash table
1239-
i = 0
1240-
new_categories = np.zeros(table.n_occupied, dtype='float64')
1241-
with nogil:
1242-
for k in range(table.n_buckets):
1243-
if kh_exist_float64(table, k):
1244-
new_categories[i] = table.keys[k]
1245-
i += 1
1246-
kh_destroy_float64(table)
1247-
return np.asarray(new_codes), np.asarray(new_categories)
1248-
1249-
1250-
@cython.wraparound(False)
1251-
@cython.boundscheck(False)
1252-
def recategorize_object(list codes, list cats, int N, int recode_size):
1253-
cdef:
1254-
kh_pymap_t *table = kh_init_pymap()
1255-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1256-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1257-
int64_t[:] current_codes
1258-
object[:] new_categories, current_categories
1259-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1260-
int ret = 0
1261-
int64_t current_code = 0
1262-
khiter_t k
1263-
1264-
for cat_id in range(len(codes)):
1265-
current_codes = codes[cat_id]
1266-
current_categories = cats[cat_id]
1267-
1268-
n_cats = current_categories.shape[0]
1269-
n_codes = current_codes.shape[0]
1270-
if cat_id == 0:
1271-
kh_resize_pymap(table, n_cats)
1272-
# first pass dump directly in to table since uniqueness
1273-
# is guaranteed and don't need to recode
1274-
for j in range(n_cats):
1275-
k = kh_put_pymap(table, <PyObject *>current_categories[j], &ret)
1276-
table.vals[k] = current_code
1277-
current_code += 1
1278-
with nogil:
1279-
# reuse codes
1280-
for j in range(n_codes):
1281-
new_codes[i] = current_codes[j]
1282-
i += 1
1283-
else:
1284-
for j in range(n_cats):
1285-
k = kh_get_pymap(table, <PyObject*>current_categories[j])
1286-
1287-
# if a new category, add to the master hash table
1288-
if k == table.n_buckets:
1289-
k = kh_put_pymap(table, <PyObject*>current_categories[j], &ret)
1290-
table.vals[k] = current_code
1291-
current_code += 1
1292-
1293-
# add to the recode table, mapping from
1294-
# orig category -> master_category
1295-
recode[j] = table.vals[k]
1296-
1297-
with nogil:
1298-
for j in range(n_codes):
1299-
# continue filling new codes, this pass
1300-
# looking up in recode table
1301-
if current_codes[j] == -1:
1302-
new_codes[i] = -1
1303-
else:
1304-
new_codes[i] = recode[current_codes[j]]
1305-
i += 1
1306-
1307-
# fill in new categories from hash table
1308-
i = 0
1309-
new_categories = np.zeros(table.n_occupied, dtype='object')
1310-
for k in range(table.n_buckets):
1311-
if kh_exist_pymap(table, k):
1312-
new_categories[i] = <object>table.keys[k]
1313-
i += 1
1314-
kh_destroy_pymap(table)
1315-
return np.asarray(new_codes), np.asarray(new_categories)
1316-
13171117

13181118
@cython.wraparound(False)
13191119
@cython.boundscheck(False)

pandas/tests/test_categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3944,7 +3944,7 @@ def f():
39443944
tm.assert_frame_equal(result, expected)
39453945

39463946
def test_union(self):
3947-
from pandas.core.algorithms import union_categoricals
3947+
from pandas.types.concat import union_categoricals
39483948

39493949
s = Categorical(list('abc'))
39503950
s2 = Categorical(list('abd'))

pandas/types/concat.py

+40
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,46 @@ def convert_categorical(x):
201201
return Categorical(concatted, rawcats)
202202

203203

204+
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    Parameters
    ----------
    to_union : list-like of Categorical

    Returns
    -------
    Categorical
        A single array; categories are ordered as they first
        appear in the list.

    Raises
    ------
    ValueError
        If `to_union` is empty.
    TypeError
        If any input is ordered, or category dtypes differ.
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        # Explicit error instead of an opaque IndexError below.
        raise ValueError("No Categoricals to union")

    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    first = to_union[0]
    if not all(com.is_dtype_equal(c.categories, first.categories)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # Union the categories, preserving first-appearance order.
    cats = first.categories.tolist()
    for c in to_union[1:]:
        cats = cats + c.categories.difference(Index(cats)).tolist()
    cats = Index(cats)

    # Remap each array's codes onto the unioned categories.
    new_codes = []
    for c in to_union:
        indexer = cats.get_indexer(c.categories)
        codes = np.asarray(c.codes)
        # A code of -1 marks NaN; keep it as -1 rather than letting
        # take() wrap around and map NaN onto the last category.
        recoded = np.full(len(codes), -1, dtype='int64')
        mask = codes != -1
        if mask.any():
            recoded[mask] = indexer.take(codes[mask])
        new_codes.append(recoded)

    return Categorical.from_codes(np.concatenate(new_codes), cats)
242+
243+
204244
def _concat_datetime(to_concat, axis=0, typs=None):
205245
"""
206246
provide concatenation of an datetimelike array of arrays each of which is a

0 commit comments

Comments
 (0)