Skip to content

Commit 7b37c34

Browse files
committed
cleanup impl, add asv
1 parent ccaeb76 commit 7b37c34

File tree

5 files changed

+56
-235
lines changed

5 files changed

+56
-235
lines changed

asv_bench/benchmarks/categoricals.py

+15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
from .pandas_vb_common import *
2+
try:
3+
from pandas.types.concat import union_categoricals
4+
except ImportError:
5+
pass
26
import string
37

48

@@ -12,6 +16,17 @@ def time_concat_categorical(self):
1216
concat([self.s, self.s])
1317

1418

19+
class union_categorical(object):
    # ASV benchmark: time unioning two large, partially-overlapping
    # Categoricals via pandas.types.concat.union_categoricals.
    goal_time = 0.2

    def setup(self):
        # Two 6M-element categoricals sharing categories 'b', 'c', 'd'.
        self.a = pd.Categorical(list("aabbcd") * 1000000)
        self.b = pd.Categorical(list("bbcdjk") * 1000000)

    def time_union_categorical(self):
        union_categoricals([self.a, self.b])
28+
29+
1530
class categorical_value_counts(object):
1631
goal_time = 1
1732

pandas/core/algorithms.py

-34
Original file line numberDiff line numberDiff line change
@@ -573,34 +573,6 @@ def select_n(series, n, keep, method):
573573
return dropped.iloc[inds]
574574

575575

576-
def union_categoricals(to_concat):
577-
"""
578-
Combine list-like of Categoricals, unioning categories. All
579-
must have the same dtype, and none can be ordered.
580-
581-
Makes no guarantee about the ordering of the new categories
582-
"""
583-
from pandas.core.categorical import Categorical
584-
585-
if any(c.ordered for c in to_concat):
586-
raise TypeError("Can only combine unordered Categoricals")
587-
588-
first = to_concat[0]
589-
if not all(com.is_dtype_equal(c.categories, first.categories)
590-
for c in to_concat):
591-
raise TypeError("dtype of categories must be the same")
592-
593-
new_size = sum(len(c.codes) for c in to_concat)
594-
recode_size = max(len(c.codes) for c in to_concat)
595-
codes = [com._ensure_int64(c.codes) for c in to_concat]
596-
597-
algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner)
598-
f, _ = algo_getter(first)
599-
categories = [algo_getter(c)[1] for c in to_concat]
600-
new_codes, new_categories = f(codes, categories, new_size, recode_size)
601-
return Categorical.from_codes(new_codes, new_categories)
602-
603-
604576
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
605577
ns, = np.nonzero(arr <= kth_val)
606578
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
@@ -640,12 +612,6 @@ def _hashtable_algo(f, dtype, return_dtype=None):
640612
'generic': (htable.PyObjectHashTable, htable.ObjectVector)
641613
}
642614

643-
_categorical_combiner = {
644-
'float64': htable.recategorize_float64,
645-
'int64': htable.recategorize_int64,
646-
'generic': htable.recategorize_object
647-
}
648-
649615

650616
def _get_data_algo(values, func_map):
651617
if com.is_float_dtype(values):

pandas/hashtable.pyx

-200
Original file line numberDiff line numberDiff line change
@@ -1114,206 +1114,6 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
11141114
kh_destroy_int64(table)
11151115
return out
11161116

1117-
@cython.wraparound(False)
1118-
@cython.boundscheck(False)
1119-
def recategorize_int64(list codes, list cats, int N, int recode_size):
1120-
cdef:
1121-
kh_int64_t *table = kh_init_int64()
1122-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1123-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1124-
int64_t[:] current_codes
1125-
int64_t[:] new_categories, current_categories
1126-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1127-
int ret = 0
1128-
int64_t current_code = 0
1129-
khiter_t k
1130-
1131-
for cat_id in range(len(codes)):
1132-
current_codes = codes[cat_id]
1133-
current_categories = cats[cat_id]
1134-
1135-
with nogil:
1136-
n_cats = current_categories.shape[0]
1137-
n_codes = current_codes.shape[0]
1138-
if cat_id == 0:
1139-
kh_resize_int64(table, n_cats)
1140-
# first pass dump directly in to table since uniqueness
1141-
# is guaranteed
1142-
for j in range(n_cats):
1143-
k = kh_put_int64(table, current_categories[j], &ret)
1144-
table.vals[k] = current_code
1145-
current_code += 1
1146-
# reuse codes
1147-
for j in range(n_codes):
1148-
new_codes[i] = current_codes[j]
1149-
i += 1
1150-
else:
1151-
for j in range(n_cats):
1152-
k = kh_get_int64(table, current_categories[j])
1153-
1154-
# if a new category, add to the master hash table
1155-
if k == table.n_buckets:
1156-
k = kh_put_int64(table, current_categories[j], &ret)
1157-
table.vals[k] = current_code
1158-
current_code += 1
1159-
# add to the recode table, mapping from
1160-
# orig category -> master_category
1161-
recode[j] = table.vals[k]
1162-
1163-
for j in range(n_codes):
1164-
# continue filing new codes, this pass
1165-
# looking up in recode table
1166-
if current_codes[j] == -1:
1167-
new_codes[i] = -1
1168-
else:
1169-
new_codes[i] = recode[current_codes[j]]
1170-
i += 1
1171-
1172-
# fill in new categories from hash table
1173-
i = 0
1174-
new_categories = np.zeros(table.n_occupied, dtype='int64')
1175-
with nogil:
1176-
for k in range(table.n_buckets):
1177-
if kh_exist_int64(table, k):
1178-
new_categories[i] = table.keys[k]
1179-
i += 1
1180-
kh_destroy_int64(table)
1181-
return np.asarray(new_codes), np.asarray(new_categories)
1182-
1183-
# this could be fused with the int version
1184-
# but no great way to work with hash table
1185-
@cython.wraparound(False)
1186-
@cython.boundscheck(False)
1187-
def recategorize_float64(list codes, list cats, int N, int recode_size):
1188-
cdef:
1189-
kh_float64_t *table = kh_init_float64()
1190-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1191-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1192-
int64_t[:] current_codes
1193-
float64_t[:] new_categories, current_categories
1194-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1195-
int ret = 0
1196-
int64_t current_code = 0
1197-
khiter_t k
1198-
1199-
for cat_id in range(len(codes)):
1200-
current_codes = codes[cat_id]
1201-
current_categories = cats[cat_id]
1202-
1203-
with nogil:
1204-
n_cats = current_categories.shape[0]
1205-
n_codes = current_codes.shape[0]
1206-
if cat_id == 0:
1207-
# first pass dump directly in, since uniqueness is guaranteed
1208-
# and don't need to recode
1209-
kh_resize_float64(table, n_cats)
1210-
for j in range(n_cats):
1211-
k = kh_put_float64(table, current_categories[j], &ret)
1212-
table.vals[k] = current_code
1213-
current_code += 1
1214-
for j in range(n_codes):
1215-
new_codes[i] = current_codes[j]
1216-
i += 1
1217-
else:
1218-
for j in range(n_cats):
1219-
k = kh_get_float64(table, current_categories[j])
1220-
1221-
# if a new category, add to the master hash table
1222-
if k == table.n_buckets:
1223-
k = kh_put_float64(table, current_categories[j], &ret)
1224-
table.vals[k] = current_code
1225-
current_code += 1
1226-
1227-
# add to the recode table, mapping from
1228-
# orig_category -> master_category
1229-
recode[j] = table.vals[k]
1230-
1231-
for j in range(n_codes):
1232-
if current_codes[j] == -1:
1233-
new_codes[i] = -1
1234-
else:
1235-
new_codes[i] = recode[current_codes[j]]
1236-
i += 1
1237-
1238-
# fill in new categories from hash table
1239-
i = 0
1240-
new_categories = np.zeros(table.n_occupied, dtype='float64')
1241-
with nogil:
1242-
for k in range(table.n_buckets):
1243-
if kh_exist_float64(table, k):
1244-
new_categories[i] = table.keys[k]
1245-
i += 1
1246-
kh_destroy_float64(table)
1247-
return np.asarray(new_codes), np.asarray(new_categories)
1248-
1249-
1250-
@cython.wraparound(False)
1251-
@cython.boundscheck(False)
1252-
def recategorize_object(list codes, list cats, int N, int recode_size):
1253-
cdef:
1254-
kh_pymap_t *table = kh_init_pymap()
1255-
int64_t[:] new_codes = np.empty(N, dtype='int64')
1256-
int64_t[:] recode = np.empty(recode_size, dtype='int64')
1257-
int64_t[:] current_codes
1258-
object[:] new_categories, current_categories
1259-
Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
1260-
int ret = 0
1261-
int64_t current_code = 0
1262-
khiter_t k
1263-
1264-
for cat_id in range(len(codes)):
1265-
current_codes = codes[cat_id]
1266-
current_categories = cats[cat_id]
1267-
1268-
n_cats = current_categories.shape[0]
1269-
n_codes = current_codes.shape[0]
1270-
if cat_id == 0:
1271-
kh_resize_pymap(table, n_cats)
1272-
# first pass dump directly in to table since uniqueness
1273-
# is guaranteed and don't need to recode
1274-
for j in range(n_cats):
1275-
k = kh_put_pymap(table, <PyObject *>current_categories[j], &ret)
1276-
table.vals[k] = current_code
1277-
current_code += 1
1278-
with nogil:
1279-
# reuse codes
1280-
for j in range(n_codes):
1281-
new_codes[i] = current_codes[j]
1282-
i += 1
1283-
else:
1284-
for j in range(n_cats):
1285-
k = kh_get_pymap(table, <PyObject*>current_categories[j])
1286-
1287-
# if a new category, add to the master hash table
1288-
if k == table.n_buckets:
1289-
k = kh_put_pymap(table, <PyObject*>current_categories[j], &ret)
1290-
table.vals[k] = current_code
1291-
current_code += 1
1292-
1293-
# add to the recode table, mapping from
1294-
# orig category -> master_category
1295-
recode[j] = table.vals[k]
1296-
1297-
with nogil:
1298-
for j in range(n_codes):
1299-
# continue filling new codes, this pass
1300-
# looking up in recode table
1301-
if current_codes[j] == -1:
1302-
new_codes[i] = -1
1303-
else:
1304-
new_codes[i] = recode[current_codes[j]]
1305-
i += 1
1306-
1307-
# fill in new categories from hash table
1308-
i = 0
1309-
new_categories = np.zeros(table.n_occupied, dtype='object')
1310-
for k in range(table.n_buckets):
1311-
if kh_exist_pymap(table, k):
1312-
new_categories[i] = <object>table.keys[k]
1313-
i += 1
1314-
kh_destroy_pymap(table)
1315-
return np.asarray(new_codes), np.asarray(new_categories)
1316-
13171117

13181118
@cython.wraparound(False)
13191119
@cython.boundscheck(False)

pandas/tests/test_categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3944,7 +3944,7 @@ def f():
39443944
tm.assert_frame_equal(result, expected)
39453945

39463946
def test_union(self):
3947-
from pandas.core.algorithms import union_categoricals
3947+
from pandas.types.concat import union_categoricals
39483948

39493949
s = Categorical(list('abc'))
39503950
s2 = Categorical(list('abd'))

pandas/types/concat.py

+40
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,46 @@ def convert_categorical(x):
201201
return Categorical(concatted, rawcats)
202202

203203

204+
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    Parameters
    ----------
    to_union : list-like of Categorical

    Returns
    -------
    Categorical
        A single array; categories are ordered as they first
        appear in the list.

    Raises
    ------
    ValueError
        If `to_union` is empty.
    TypeError
        If any input is ordered, or category dtypes differ.
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        # Explicit error instead of an opaque IndexError below.
        raise ValueError("No Categoricals to union")

    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    first = to_union[0]
    if not all(com.is_dtype_equal(c.categories, first.categories)
               for c in to_union):
        raise TypeError("dtype of categories must be the same")

    # Union the categories, preserving first-appearance order.
    cats = first.categories.tolist()
    for c in to_union[1:]:
        cats = cats + c.categories.difference(Index(cats)).tolist()
    cats = Index(cats)

    # Remap each array's codes onto the unioned categories.
    new_codes = []
    for c in to_union:
        indexer = cats.get_indexer(c.categories)
        codes = np.asarray(c.codes)
        # A code of -1 marks NaN; keep it as -1 rather than letting
        # take() wrap around and map NaN onto the last category.
        recoded = np.full(len(codes), -1, dtype='int64')
        mask = codes != -1
        if mask.any():
            recoded[mask] = indexer.take(codes[mask])
        new_codes.append(recoded)

    return Categorical.from_codes(np.concatenate(new_codes), cats)
242+
243+
204244
def _concat_datetime(to_concat, axis=0, typs=None):
205245
"""
206246
provide concatenation of an datetimelike array of arrays each of which is a

0 commit comments

Comments
 (0)