API/ENH: union Categorical

chris-b1 · chris-b1 · commit ccaeb76fbd6a · 2016-06-04T06:52:29.000-05:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -573,6 +573,34 @@ def select_n(series, n, keep, method):
     return dropped.iloc[inds]
 
 
+def union_categoricals(to_concat):
+    """
+    Combine list-like of Categoricals, unioning categories. All
+    must have the same dtype, and none can be ordered.
+
+    Makes no guarantee about the ordering of the new categories
+    """
+    from pandas.core.categorical import Categorical
+
+    if any(c.ordered for c in to_concat):
+        raise TypeError("Can only combine unordered Categoricals")
+
+    first = to_concat[0]
+    if not all(com.is_dtype_equal(c.categories, first.categories)
+               for c in to_concat):
+        raise TypeError("dtype of categories must be the same")
+
+    new_size = sum(len(c.codes) for c in to_concat)
+    recode_size = max(len(c.codes) for c in to_concat)
+    codes = [com._ensure_int64(c.codes) for c in to_concat]
+
+    algo_getter = lambda x: _get_data_algo(x.categories, _categorical_combiner)
+    f, _ = algo_getter(first)
+    categories = [algo_getter(c)[1] for c in to_concat]
+    new_codes, new_categories = f(codes, categories, new_size, recode_size)
+    return Categorical.from_codes(new_codes, new_categories)
+
+
 def _finalize_nsmallest(arr, kth_val, n, keep, narr):
     ns, = np.nonzero(arr <= kth_val)
     inds = ns[arr[ns].argsort(kind='mergesort')][:n]
@@ -612,6 +640,12 @@ def _hashtable_algo(f, dtype, return_dtype=None):
     'generic': (htable.PyObjectHashTable, htable.ObjectVector)
 }
 
+_categorical_combiner = {
+    'float64': htable.recategorize_float64,
+    'int64': htable.recategorize_int64,
+    'generic': htable.recategorize_object
+}
+
 
 def _get_data_algo(values, func_map):
     if com.is_float_dtype(values):
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -1114,6 +1114,206 @@ def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     kh_destroy_int64(table)
     return out
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def recategorize_int64(list codes, list cats, int N, int recode_size):
+    """
+    Recode several sets of int64 categories / codes onto one combined set
+    of categories.
+
+    Parameters
+    ----------
+    codes : list of int64 arrays
+        Codes of each input; a code of -1 is passed through unchanged.
+    cats : list of int64 arrays
+        Categories of each input, parallel to `codes`. The first entry is
+        inserted without a lookup, so it is assumed to be unique.
+    N : int
+        Length of the output codes; must equal the total number of codes
+        over all inputs.
+    recode_size : int
+        Length of the scratch recode buffer. Bounds checking is disabled,
+        so it must be at least the number of categories of every input
+        after the first.
+
+    Returns
+    -------
+    new_codes : int64 ndarray of length N
+    new_categories : int64 ndarray
+        Combined categories, positioned so that
+        ``new_categories[new_codes[i]]`` is the category the i-th input
+        code referred to.
+    """
+    cdef:
+        kh_int64_t *table = kh_init_int64()
+        int64_t[:] new_codes = np.empty(N, dtype='int64')
+        int64_t[:] recode = np.empty(recode_size, dtype='int64')
+        int64_t[:] current_codes
+        int64_t[:] new_categories, current_categories
+        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
+        int ret = 0
+        int64_t current_code = 0
+        khiter_t k
+
+    for cat_id in range(len(codes)):
+        current_codes = codes[cat_id]
+        current_categories = cats[cat_id]
+
+        with nogil:
+            n_cats = current_categories.shape[0]
+            n_codes = current_codes.shape[0]
+            if cat_id == 0:
+                kh_resize_int64(table, n_cats)
+                # first pass dump directly in to table since uniqueness
+                # is guaranteed
+                for j in range(n_cats):
+                    k = kh_put_int64(table, current_categories[j], &ret)
+                    table.vals[k] = current_code
+                    current_code += 1
+                # reuse codes
+                for j in range(n_codes):
+                    new_codes[i] = current_codes[j]
+                    i += 1
+            else:
+                for j in range(n_cats):
+                    k = kh_get_int64(table, current_categories[j])
+
+                    # if a new category, add to the master hash table
+                    if k == table.n_buckets:
+                        k = kh_put_int64(table, current_categories[j], &ret)
+                        table.vals[k] = current_code
+                        current_code += 1
+                    # add to the recode table, mapping from
+                    # orig category -> master category
+                    recode[j] = table.vals[k]
+
+                for j in range(n_codes):
+                    # continue filling new codes, this pass
+                    # looking up in recode table
+                    if current_codes[j] == -1:
+                        new_codes[i] = -1
+                    else:
+                        new_codes[i] = recode[current_codes[j]]
+                    i += 1
+
+    # fill in new categories from hash table. Buckets are visited in hash
+    # order, which is unrelated to the order codes were handed out, so
+    # each key is stored at the position of its assigned code; otherwise
+    # new_codes would point at the wrong categories.
+    # current_code is the number of codes handed out so far.
+    new_categories = np.zeros(current_code, dtype='int64')
+    with nogil:
+        for k in range(table.n_buckets):
+            if kh_exist_int64(table, k):
+                new_categories[table.vals[k]] = table.keys[k]
+        kh_destroy_int64(table)
+    return np.asarray(new_codes), np.asarray(new_categories)
+
+# NOTE: this could be fused with the int64 version, but there is
+# no great way to make the hash table calls generic over the key type
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def recategorize_float64(list codes, list cats, int N, int recode_size):
+    """
+    Recode several sets of float64 categories / codes onto one combined
+    set of categories.
+
+    Parameters
+    ----------
+    codes : list of int64 arrays
+        Codes of each input; a code of -1 is passed through unchanged.
+    cats : list of float64 arrays
+        Categories of each input, parallel to `codes`. The first entry is
+        inserted without a lookup, so it is assumed to be unique.
+    N : int
+        Length of the output codes; must equal the total number of codes
+        over all inputs.
+    recode_size : int
+        Length of the scratch recode buffer. Bounds checking is disabled,
+        so it must be at least the number of categories of every input
+        after the first.
+
+    Returns
+    -------
+    new_codes : int64 ndarray of length N
+    new_categories : float64 ndarray
+        Combined categories, positioned so that
+        ``new_categories[new_codes[i]]`` is the category the i-th input
+        code referred to.
+    """
+    cdef:
+        kh_float64_t *table = kh_init_float64()
+        int64_t[:] new_codes = np.empty(N, dtype='int64')
+        int64_t[:] recode = np.empty(recode_size, dtype='int64')
+        int64_t[:] current_codes
+        float64_t[:] new_categories, current_categories
+        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
+        int ret = 0
+        int64_t current_code = 0
+        khiter_t k
+
+    for cat_id in range(len(codes)):
+        current_codes = codes[cat_id]
+        current_categories = cats[cat_id]
+
+        with nogil:
+            n_cats = current_categories.shape[0]
+            n_codes = current_codes.shape[0]
+            if cat_id == 0:
+                # first pass dump directly in, since uniqueness is guaranteed
+                # and don't need to recode
+                kh_resize_float64(table, n_cats)
+                for j in range(n_cats):
+                    k = kh_put_float64(table, current_categories[j], &ret)
+                    table.vals[k] = current_code
+                    current_code += 1
+                for j in range(n_codes):
+                    new_codes[i] = current_codes[j]
+                    i += 1
+            else:
+                for j in range(n_cats):
+                    k = kh_get_float64(table, current_categories[j])
+
+                    # if a new category, add to the master hash table
+                    if k == table.n_buckets:
+                        k = kh_put_float64(table, current_categories[j], &ret)
+                        table.vals[k] = current_code
+                        current_code += 1
+
+                    # add to the recode table, mapping from
+                    # orig category -> master category
+                    recode[j] = table.vals[k]
+
+                for j in range(n_codes):
+                    # -1 is passed through, everything else goes
+                    # through the recode table
+                    if current_codes[j] == -1:
+                        new_codes[i] = -1
+                    else:
+                        new_codes[i] = recode[current_codes[j]]
+                    i += 1
+
+    # fill in new categories from hash table. Buckets are visited in hash
+    # order, which is unrelated to the order codes were handed out, so
+    # each key is stored at the position of its assigned code; otherwise
+    # new_codes would point at the wrong categories.
+    # current_code is the number of codes handed out so far.
+    new_categories = np.zeros(current_code, dtype='float64')
+    with nogil:
+        for k in range(table.n_buckets):
+            if kh_exist_float64(table, k):
+                new_categories[table.vals[k]] = table.keys[k]
+        kh_destroy_float64(table)
+    return np.asarray(new_codes), np.asarray(new_categories)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def recategorize_object(list codes, list cats, int N, int recode_size):
+    """
+    Recode several sets of object categories / codes onto one combined
+    set of categories.
+
+    Parameters
+    ----------
+    codes : list of int64 arrays
+        Codes of each input; a code of -1 is passed through unchanged.
+    cats : list of object arrays
+        Categories of each input, parallel to `codes`. The first entry is
+        inserted without a lookup, so it is assumed to be unique.
+    N : int
+        Length of the output codes; must equal the total number of codes
+        over all inputs.
+    recode_size : int
+        Length of the scratch recode buffer. Bounds checking is disabled,
+        so it must be at least the number of categories of every input
+        after the first.
+
+    Returns
+    -------
+    new_codes : int64 ndarray of length N
+    new_categories : object ndarray
+        Combined categories, positioned so that
+        ``new_categories[new_codes[i]]`` is the category the i-th input
+        code referred to.
+    """
+    cdef:
+        kh_pymap_t *table = kh_init_pymap()
+        int64_t[:] new_codes = np.empty(N, dtype='int64')
+        int64_t[:] recode = np.empty(recode_size, dtype='int64')
+        int64_t[:] current_codes
+        object[:] new_categories, current_categories
+        Py_ssize_t cat_id, j, n_codes, n_cats, i = 0
+        int ret = 0
+        int64_t current_code = 0
+        khiter_t k
+
+    for cat_id in range(len(codes)):
+        current_codes = codes[cat_id]
+        current_categories = cats[cat_id]
+
+        n_cats = current_categories.shape[0]
+        n_codes = current_codes.shape[0]
+        if cat_id == 0:
+            kh_resize_pymap(table, n_cats)
+            # first pass dump directly in to table since uniqueness
+            # is guaranteed and don't need to recode
+            for j in range(n_cats):
+                k = kh_put_pymap(table, <PyObject *>current_categories[j], &ret)
+                table.vals[k] = current_code
+                current_code += 1
+            with nogil:
+                # reuse codes
+                for j in range(n_codes):
+                    new_codes[i] = current_codes[j]
+                    i += 1
+        else:
+            for j in range(n_cats):
+                k = kh_get_pymap(table, <PyObject*>current_categories[j])
+
+                # if a new category, add to the master hash table
+                if k == table.n_buckets:
+                    k = kh_put_pymap(table, <PyObject*>current_categories[j], &ret)
+                    table.vals[k] = current_code
+                    current_code += 1
+
+                # add to the recode table, mapping from
+                # orig category -> master category
+                recode[j] = table.vals[k]
+
+            with nogil:
+                for j in range(n_codes):
+                    # continue filling new codes, this pass
+                    # looking up in recode table
+                    if current_codes[j] == -1:
+                        new_codes[i] = -1
+                    else:
+                        new_codes[i] = recode[current_codes[j]]
+                    i += 1
+
+    # fill in new categories from hash table. Buckets are visited in hash
+    # order, which is unrelated to the order codes were handed out, so
+    # each key is stored at the position of its assigned code; otherwise
+    # new_codes would point at the wrong categories.
+    # current_code is the number of codes handed out so far.
+    new_categories = np.zeros(current_code, dtype='object')
+    for k in range(table.n_buckets):
+        if kh_exist_pymap(table, k):
+            new_categories[table.vals[k]] = <object>table.keys[k]
+    kh_destroy_pymap(table)
+    return np.asarray(new_codes), np.asarray(new_categories)
+
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -3943,6 +3943,38 @@ def f():
                                   'category', categories=list('cab'))})
         tm.assert_frame_equal(result, expected)
 
+    def test_union(self):
+        from pandas.core.algorithms import union_categoricals
+
+        s = Categorical(list('abc'))
+        s2 = Categorical(list('abd'))
+        result = union_categoricals([s, s2])
+        expected = Categorical(list('abcabd'))
+        tm.assert_categorical_equal(result, expected, ignore_order=True)
+
+        s = Categorical([0,1,2])
+        s2 = Categorical([2,3,4])
+        result = union_categoricals([s, s2])
+        expected = Categorical([0,1,2,2,3,4])
+        tm.assert_categorical_equal(result, expected, ignore_order=True)
+
+        s = Categorical([0,1.2,2])
+        s2 = Categorical([2,3.4,4])
+        result = union_categoricals([s, s2])
+        expected = Categorical([0,1.2,2,2,3.4,4])
+        tm.assert_categorical_equal(result, expected, ignore_order=True)
+
+        # can't be ordered
+        s = Categorical([0,1.2,2], ordered=True)
+        with tm.assertRaises(TypeError):
+            union_categoricals([s, s2])
+
+        # must exactly match types
+        s = Categorical([0,1.2,2])
+        s2 = Categorical([2,3,4])
+        with tm.assertRaises(TypeError):
+            union_categoricals([s, s2])
+
     def test_categorical_index_preserver(self):
 
         a = Series(np.arange(6, dtype='int64'))
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -963,12 +963,17 @@ def assertNotIsInstance(obj, cls, msg=''):
 
 
 def assert_categorical_equal(left, right, check_dtype=True,
-                             obj='Categorical'):
+                             obj='Categorical', ignore_order=False):
     assertIsInstance(left, pd.Categorical, '[Categorical] ')
     assertIsInstance(right, pd.Categorical, '[Categorical] ')
 
-    assert_index_equal(left.categories, right.categories,
-                       obj='{0}.categories'.format(obj))
+    if ignore_order:
+        assert_index_equal(left.categories.sort_values(),
+                           right.categories.sort_values(),
+                           obj='{0}.categories'.format(obj))
+    else:
+        assert_index_equal(left.categories, right.categories,
+                           obj='{0}.categories'.format(obj))
     assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
                              obj='{0}.codes'.format(obj))