From 858f54e5e4d5e84f2a45f7c5954cbde9fb6d9438 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 15 Oct 2018 23:38:34 +0200
Subject: [PATCH 01/17] Unify unique/factorize, remove kwargs (perf); enable
 inverse for unique

---
 pandas/_libs/hashtable.pyx                 |   3 +-
 pandas/_libs/hashtable_class_helper.pxi.in | 438 +++++++++++----------
 pandas/core/algorithms.py                  |   5 +-
 pandas/tests/test_algos.py                 |  22 +-
 4 files changed, 242 insertions(+), 226 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 2ced98198afc6..1d5f637a05188 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -124,8 +124,7 @@ cdef class Int64Factorizer:
             uniques.extend(self.uniques.to_array())
             self.uniques = uniques
         labels = self.table.get_labels(values, self.uniques,
-                                       self.count, na_sentinel,
-                                       na_value=na_value)
+                                       self.count, na_sentinel, na_value)
 
         # sort on
         if sort:
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c061102fbaddc..6052768f69e84 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -357,11 +357,12 @@ cdef class {{name}}HashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                bint ignore_na, bint return_uniques,
+                bint return_inverse, Py_ssize_t count_prior,
+                Py_ssize_t na_sentinel, object na_value):
         """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting!)
 
         Parameters
         ----------
@@ -369,19 +370,32 @@ cdef class {{name}}HashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : {{name}}Vector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t, default 0
+        ignore_na : boolean
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_uniques : boolean
+            Whether to return the content of the passed "uniques" vector as an
+            np.ndarray at the end. If False, the vector passed to "uniques"
+            must be explicitly read and transformed by the user.
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+        count_prior : Py_ssize_t
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t, default -1
+        na_sentinel : Py_ssize_t
             Sentinel value used for all NA-values in inverse
-        na_value : object, default None
+        na_value : object
             Value to identify as missing. If na_value is None, then
-            any value satisfying val!=val are considered missing.
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
 
         Returns
         -------
-        uniques : ndarray[{{dtype}}]
+        uniques : ndarray[{{dtype}}] (if return_uniques)
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -393,7 +407,8 @@ cdef class {{name}}HashTable(HashTable):
             {{name}}VectorData *ud
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
 
@@ -411,20 +426,16 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if val != val or (use_na_value and val == na_value2):
+                if ignore_na and (val != val
+                                  or (use_na_value and val == na_value2)):
                     labels[i] = na_sentinel
                     continue
 
                 k = kh_get_{{dtype}}(self.table, val)
 
-                if k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = idx
-                else:
+                if k == self.table.n_buckets:
                     # k hasn't been seen yet
                     k = kh_put_{{dtype}}(self.table, val, &ret)
-                    self.table.vals[k] = count
 
                     if needs_resize(ud):
                         with gil:
@@ -434,23 +445,54 @@ cdef class {{name}}HashTable(HashTable):
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    labels[i] = count
-                    count += 1
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = count
+                        count += 1
+                elif return_inverse:
+                    # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
+                    idx = self.table.vals[k]
+                    labels[i] = idx
 
-        return np.asarray(labels)
+        if return_uniques and return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        elif return_uniques:
+            return uniques.to_array()
+        elif return_inverse:
+            return np.asarray(labels)
 
-    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse):
         uniques = {{name}}Vector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        return self._unique(values, uniques,
+                            False,  # ignore_na
+                            True,   # return_uniques
+                            return_inverse,
+                            # the rest are of the parameters are not relevant,
+                            # but we don't use kwargs to avoid cython perf hit
+                            0,      # count_prior
+                            -1,     # na_sentinel
+                            None)   # na_value
+
+    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel,
+                  object na_value):
+        uniques = {{name}}Vector()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques,
+                            True,  # ignore_na
+                            True,  # return_uniques
+                            True,  # return_inverse
+                            0,     # count_prior
+                            na_sentinel, na_value)[::-1]
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                   object na_value):
+        return self._unique(values, uniques,
+                            True,   # ignore_na
+                            False,  # return_uniques
+                            True,   # return_inverse
+                            count_prior, na_sentinel, na_value)
 
     @cython.boundscheck(False)
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
@@ -497,44 +539,6 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(labels), arr_uniques
 
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def unique(self, const {{dtype}}_t[:] values):
-        """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[{{dtype}}]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            {{dtype}}_t val
-            khiter_t k
-            {{name}}Vector uniques = {{name}}Vector()
-            {{name}}VectorData *ud
-
-        ud = uniques.data
-
-        with nogil:
-            for i in range(n):
-                val = values[i]
-                k = kh_get_{{dtype}}(self.table, val)
-                if k == self.table.n_buckets:
-                    kh_put_{{dtype}}(self.table, val, &ret)
-                    if needs_resize(ud):
-                        with gil:
-                            uniques.resize()
-                    append_data_{{dtype}}(ud, val)
-        return uniques.to_array()
-
 {{endfor}}
 
 
@@ -614,56 +618,6 @@ cdef class StringHashTable(HashTable):
         free(vecs)
         return labels
 
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def unique(self, ndarray[object] values):
-        """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, count, n = len(values)
-            int64_t[:] uindexer
-            int ret = 0
-            object val
-            ObjectVector uniques
-            khiter_t k
-            const char *v
-            const char **vecs
-
-        vecs = <const char **> malloc(n * sizeof(char *))
-        uindexer = np.empty(n, dtype=np.int64)
-        for i in range(n):
-            val = values[i]
-            v = util.get_c_string(val)
-            vecs[i] = v
-
-        count = 0
-        with nogil:
-            for i in range(n):
-                v = vecs[i]
-                k = kh_get_str(self.table, v)
-                if k == self.table.n_buckets:
-                    kh_put_str(self.table, v, &ret)
-                    uindexer[count] = i
-                    count += 1
-        free(vecs)
-
-        # uniques
-        uniques = ObjectVector()
-        for i in range(count):
-            uniques.append(values[uindexer[i]])
-        return uniques.to_array()
-
     @cython.boundscheck(False)
     def lookup(self, ndarray[object] values):
         cdef:
@@ -727,11 +681,12 @@ cdef class StringHashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _factorize(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                bint ignore_na, bint return_uniques, bint return_inverse,
+                Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                object na_value):
         """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting)
 
         Parameters
         ----------
@@ -739,18 +694,32 @@ cdef class StringHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t, default 0
+        ignore_na : boolean
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_uniques : boolean
+            Whether to return the content of the passed "uniques" vector as an
+            np.ndarray at the end. If False, the vector passed to "uniques"
+            must be explicitly read and transformed by the user.
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+        count_prior : Py_ssize_t
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t, default -1
+        na_sentinel : Py_ssize_t
             Sentinel value used for all NA-values in inverse
-        na_value : object, default None
-            Value to identify as missing
+        na_value : object
+            Value to identify as missing. If na_value is None, then any value
+            that is not a string is considered missing. If na_value is
+            not None, then _additionally_ any value "val" satisfying
+            val == na_value is considered missing.
 
         Returns
         -------
-        uniques : ndarray[object]
+        uniques : ndarray[object] (if return_uniques)
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse)
             The labels from values to uniques
         """
         cdef:
@@ -764,41 +733,46 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.zeros(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
-        # assign pointers and pre-filter out missing
+        # assign pointers and pre-filter out missing (if ignore_na)
         vecs = <const char **> malloc(n * sizeof(char *))
         for i in range(n):
             val = values[i]
 
-            if ((PyUnicode_Check(val) or PyString_Check(val))
-                    and not (use_na_value and val == na_value)):
+            if ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val))
+                              or (use_na_value and val == na_value)):
+                # missing value
+                labels[i] = na_sentinel
+            else:
+                # if ignore_na is False, we also stringify NaN/None/etc.
                 v = util.get_c_string(val)
                 vecs[i] = v
-            else:
-                labels[i] = na_sentinel
 
         # compute
         with nogil:
             for i in range(n):
-                if labels[i] == na_sentinel:
+                if ignore_na and labels[i] == na_sentinel:
                     continue
 
                 v = vecs[i]
                 k = kh_get_str(self.table, v)
-                if k != self.table.n_buckets:
-                    # k falls into a previous bucket
-                    idx = self.table.vals[k]
-                    labels[i] = <int64_t>idx
-                else:
+                if k == self.table.n_buckets:
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
-                    self.table.vals[k] = count
                     uindexer[count] = i
-                    labels[i] = <int64_t>count
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = <int64_t>count
                     count += 1
+                elif return_inverse:
+                    # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
+                    idx = self.table.vals[k]
+                    labels[i] = <int64_t>idx
 
         free(vecs)
 
@@ -806,20 +780,44 @@ cdef class StringHashTable(HashTable):
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-        return np.asarray(labels)
+        if return_uniques and return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        elif return_uniques:
+            return uniques.to_array()
+        elif return_inverse:
+            return np.asarray(labels)
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+    def unique(self, ndarray[object] values, bint return_inverse=False):
         uniques = ObjectVector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        return self._unique(values, uniques,
+                            False,  # ignore_na
+                            True,   # return_uniques
+                            return_inverse,
+                            # the rest are of the parameters are not relevant,
+                            # but we don't use kwargs to avoid cython perf hit
+                            0,      # count_prior
+                            -1,     # na_sentinel
+                            None)   # na_value
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
+                  object na_value):
+        uniques = ObjectVector()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques,
+                            True,  # ignore_na
+                            True,  # return_uniques
+                            True,  # return_inverse
+                            0,     # count_prior
+                            na_sentinel, na_value)[::-1]
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                   object na_value):
+        return self._unique(values, uniques,
+                            True,   # ignore_na
+                            False,  # return_uniques
+                            True,   # return_inverse
+                            count_prior, na_sentinel, na_value)
 
 
 cdef class PyObjectHashTable(HashTable):
@@ -909,44 +907,12 @@ cdef class PyObjectHashTable(HashTable):
 
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def unique(self, ndarray[object] values):
-        """
-        Calculate unique values without sorting
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        cdef:
-            Py_ssize_t i, n = len(values)
-            int ret = 0
-            object val
-            khiter_t k
-            ObjectVector uniques = ObjectVector()
-
-        for i in range(n):
-            val = values[i]
-            hash(val)
-            k = kh_get_pymap(self.table, <PyObject*>val)
-            if k == self.table.n_buckets:
-                kh_put_pymap(self.table, <PyObject*>val, &ret)
-                uniques.append(val)
-
-        return uniques.to_array()
-
-    @cython.boundscheck(False)
-    @cython.wraparound(False)
-    def _factorize(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                bint ignore_na, bint return_uniques, bint return_inverse,
+                Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                object na_value):
         """
-        Calculate unique values and labels (no sorting); ignores all NA-values
+        Calculate unique values and labels (no sorting)
 
         Parameters
         ----------
@@ -954,19 +920,32 @@ cdef class PyObjectHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t, default 0
+        ignore_na : boolean
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_uniques : boolean
+            Whether to return the content of the passed "uniques" vector as an
+            np.ndarray at the end. If False, the vector passed to "uniques"
+            must be explicitly read and transformed by the user.
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+        count_prior : Py_ssize_t
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t, default -1
+        na_sentinel : Py_ssize_t
             Sentinel value used for all NA-values in inverse
-        na_value : object, default None
+        na_value : object
             Value to identify as missing. If na_value is None, then None _plus_
-            any value satisfying val!=val are considered missing.
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
 
         Returns
         -------
-        uniques : ndarray[object]
+        uniques : ndarray[object] (if return_uniques)
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse)
             The labels from values to uniques
         """
         cdef:
@@ -977,42 +956,69 @@ cdef class PyObjectHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
         for i in range(n):
             val = values[i]
             hash(val)
 
-            if ((val != val or val is None)
-                    or (use_na_value and val == na_value)):
+            if ignore_na and ((val != val or val is None)
+                              or (use_na_value and val == na_value)):
                 labels[i] = na_sentinel
                 continue
 
             k = kh_get_pymap(self.table, <PyObject*>val)
-            if k != self.table.n_buckets:
-                # k falls into a previous bucket
-                idx = self.table.vals[k]
-                labels[i] = idx
-            else:
+            if k == self.table.n_buckets:
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
-                self.table.vals[k] = count
                 uniques.append(val)
-                labels[i] = count
-                count += 1
+                if return_inverse:
+                    self.table.vals[k] = count
+                    labels[i] = count
+                    count += 1
+            elif return_inverse:
+                # k falls into a previous bucket
+                # only relevant in case we need to construct the inverse
+                idx = self.table.vals[k]
+                labels[i] = idx
 
-        return np.asarray(labels)
+        if return_uniques and return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        elif return_uniques:
+            return uniques.to_array()
+        elif return_inverse:
+            return np.asarray(labels)
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        uniques = ObjectVector()
+        return self._unique(values, uniques,
+                            False,  # ignore_na
+                            True,   # return_uniques
+                            return_inverse,
+                            # the rest are of the parameters are not relevant,
+                            # but we don't use kwargs to avoid cython perf hit
+                            0,      # count_prior
+                            -1,     # na_sentinel
+                            None)   # na_value
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
+                  object na_value):
         uniques = ObjectVector()
-        labels = self._factorize(values, uniques=uniques,
-                                 na_sentinel=na_sentinel, na_value=na_value)
-        return labels, uniques.to_array()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques,
+                            True,  # ignore_na
+                            True,  # return_uniques
+                            True,  # return_inverse
+                            0,     # count_prior
+                            na_sentinel, na_value)[::-1]
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
-        return self._factorize(values, uniques, count_prior=count_prior,
-                               na_sentinel=na_sentinel, na_value=na_value)
+                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                   object na_value):
+        return self._unique(values, uniques,
+                            True,   # ignore_na
+                            False,  # return_uniques
+                            True,   # return_inverse
+                            count_prior, na_sentinel, na_value)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index df2da26685a16..42cff6c431e37 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -365,7 +365,7 @@ def unique(values):
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
+    uniques = table.unique(values, False)
     uniques = _reconstruct_data(uniques, dtype, original)
 
     if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -470,8 +470,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     (hash_klass, _), values = _get_data_algo(values, _hashtables)
 
     table = hash_klass(size_hint or len(values))
-    labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
-                                      na_value=na_value)
+    labels, uniques = table.factorize(values, na_sentinel, na_value)
 
     labels = ensure_platform_int(labels)
     return labels, uniques
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index d491df587fb4a..cf0273826aca3 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1318,7 +1318,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype,
         uniques = uniques()
 
         # get_labels may append to uniques
-        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        htable.get_labels(vals[:nvals], uniques, 0, -1, None)
         # to_array() sets an external_view_exists flag on uniques.
         tmp = uniques.to_array()
         oldshape = tmp.shape
@@ -1326,10 +1326,10 @@ def test_vector_resize(self, writable, htable, uniques, dtype,
         # subsequent get_labels() calls can no longer append to it
         # (except for StringHashTables + ObjectVector)
         if safely_resizes:
-            htable.get_labels(vals, uniques, 0, -1)
+            htable.get_labels(vals, uniques, 0, -1, None)
         else:
             with tm.assert_raises_regex(ValueError, 'external reference.*'):
-                htable.get_labels(vals, uniques, 0, -1)
+                htable.get_labels(vals, uniques, 0, -1, None)
 
         uniques.to_array()   # should not raise here
         assert tmp.shape == oldshape
@@ -1358,9 +1358,18 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.unique()
         expected_unique = s_duplicated.drop_duplicates(keep='first').values
-        result_unique = htable().unique(s_duplicated.values)
+        return_inverse = False
+        result_unique = htable().unique(s_duplicated.values, return_inverse)
         tm.assert_numpy_array_equal(result_unique, expected_unique)
 
+        # test with inverse
+        return_inverse = True
+        result_unique, result_inverse = htable().unique(s_duplicated.values,
+                                                        return_inverse)
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+        reconstr = result_unique[result_inverse]
+        tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
+
     @pytest.mark.parametrize('htable, tm_dtype', [
         (ht.PyObjectHashTable, 'String'),
         (ht.StringHashTable, 'String'),
@@ -1383,7 +1392,10 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
         s_duplicated.values.setflags(write=writable)
         na_mask = s_duplicated.isna().values
 
-        result_inverse, result_unique = htable().factorize(s_duplicated.values)
+        na_sentinel = -1
+        na_value = None
+        result = htable().factorize(s_duplicated.values, na_sentinel, na_value)
+        result_inverse, result_unique = result
 
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.factorize()

From 4ed354a9544d0576aa1b98ffd26d3667677ed6ea Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Fri, 26 Oct 2018 18:33:51 +0200
Subject: [PATCH 02/17] Template over {return_inverse, ignore_na} for perf

---
 pandas/_libs/hashtable_class_helper.pxi.in | 505 +++++++++++++--------
 1 file changed, 325 insertions(+), 180 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 6052768f69e84..6b50c65e29a4b 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -355,14 +355,87 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(locs)
 
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = {{name}}Vector()
+        # explicitly compile path without inverse for performance
+        # the last three arguments are not relevant for this method, but we
+        # don't use kwargs to avoid cython perf hit (just using default values)
+        if return_inverse:
+            return self._unique_with_inverse(values, uniques, 0, -1, None)
+        return self._unique_no_inverse(values, uniques, 0, -1, None)
+
+    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel,
+                  object na_value):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        """
+        # reduced signature compared to _factorize
+        # not necessary to have uniques-vector, count_prior
+        uniques = {{name}}Vector()
+        return self._factorize(values, uniques, 0, na_sentinel, na_value)
+
+{{py:
+# tuples of "func_name, return_inverse, ignore_na"
+unique_funcs = [('_unique_no_inverse', False, False),
+                ('_unique_with_inverse', True, False),
+                ('_factorize', True, True),
+                ('get_labels', True, True)]
+}}
+
+{{for func_name, return_inverse, ignore_na in unique_funcs}}
+
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                bint ignore_na, bint return_uniques,
-                bint return_inverse, Py_ssize_t count_prior,
-                Py_ssize_t na_sentinel, object na_value):
+    def {{func_name}}(self, const {{dtype}}_t[:] values,
+                      {{name}}Vector uniques, Py_ssize_t count_prior,
+                      Py_ssize_t na_sentinel, object na_value):
         """
         Calculate unique values and labels (no sorting!)
+{{if func_name == '_factorize' or func_name == 'get_labels'}}
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+{{endif}}
 
         Parameters
         ----------
@@ -370,17 +443,6 @@ cdef class {{name}}HashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : {{name}}Vector
             Vector into which uniques will be written
-        ignore_na : boolean
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_uniques : boolean
-            Whether to return the content of the passed "uniques" vector as an
-            np.ndarray at the end. If False, the vector passed to "uniques"
-            must be explicitly read and transformed by the user.
-        return_inverse : boolean
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t
@@ -393,10 +455,23 @@ cdef class {{name}}HashTable(HashTable):
 
         Returns
         -------
-        uniques : ndarray[{{dtype}}] (if return_uniques)
+{{if func_name == '_unique_no_inverse'}}
+        uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+{{elif func_name == '_unique_with_inverse'}}
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+{{elif func_name == '_factorize'  # switched output order for factorize}}
+        labels : ndarray[int64]
             The labels from values to uniques
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+{{elif func_name == 'get_labels'}}
+        labels : ndarray[int64]
+            The labels from values to uniques
+{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -407,7 +482,7 @@ cdef class {{name}}HashTable(HashTable):
             {{name}}VectorData *ud
             bint use_na_value
 
-        if return_inverse:
+        if {{return_inverse}}:
             labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
@@ -426,8 +501,8 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if ignore_na and (val != val
-                                  or (use_na_value and val == na_value2)):
+                if {{ignore_na}} and (val != val
+                                      or (use_na_value and val == na_value2)):
                     labels[i] = na_sentinel
                     continue
 
@@ -445,54 +520,27 @@ cdef class {{name}}HashTable(HashTable):
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    if return_inverse:
+                    if {{return_inverse}}:
                         self.table.vals[k] = count
                         labels[i] = count
                         count += 1
-                elif return_inverse:
+                elif {{return_inverse}}:
                     # k falls into a previous bucket
                     # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = idx
 
-        if return_uniques and return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        elif return_uniques:
-            return uniques.to_array()
-        elif return_inverse:
-            return np.asarray(labels)
-
-    def unique(self, const {{dtype}}_t[:] values, bint return_inverse):
-        uniques = {{name}}Vector()
-        return self._unique(values, uniques,
-                            False,  # ignore_na
-                            True,   # return_uniques
-                            return_inverse,
-                            # the rest are of the parameters are not relevant,
-                            # but we don't use kwargs to avoid cython perf hit
-                            0,      # count_prior
-                            -1,     # na_sentinel
-                            None)   # na_value
+{{if func_name == '_unique_no_inverse'}}
+        return uniques.to_array()
+{{elif func_name == '_unique_with_inverse'}}
+        return uniques.to_array(), np.asarray(labels)
+{{elif func_name == '_factorize'}}
+        return np.asarray(labels), uniques.to_array()
+{{elif func_name == 'get_labels'}}
+        return np.asarray(labels)
+{{endif}}
 
-    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel,
-                  object na_value):
-        uniques = {{name}}Vector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques,
-                            True,  # ignore_na
-                            True,  # return_uniques
-                            True,  # return_inverse
-                            0,     # count_prior
-                            na_sentinel, na_value)[::-1]
-
-    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   object na_value):
-        return self._unique(values, uniques,
-                            True,   # ignore_na
-                            False,  # return_uniques
-                            True,   # return_inverse
-                            count_prior, na_sentinel, na_value)
+{{endfor}}
 
     @cython.boundscheck(False)
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
@@ -679,14 +727,87 @@ cdef class StringHashTable(HashTable):
                 self.table.vals[k] = i
         free(vecs)
 
+    def unique(self, ndarray[object] values, bint return_inverse):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        # explicitly compile path without inverse for performance
+        # the last three arguments are not relevant for this method, but we
+        # don't use kwargs to avoid cython perf hit (just using default values)
+        if return_inverse:
+            return self._unique_with_inverse(values, uniques, 0, -1, None)
+        return self._unique_no_inverse(values, uniques, 0, -1, None)
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
+                  object na_value):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then any value
+            that is not a string is considered missing. If na_value is
+            not None, then _additionally_ any value "val" satisfying
+            val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
+        # reduced signature compared to _factorize
+        # not necessary to have uniques-vector, count_prior
+        uniques = ObjectVector()
+        return self._factorize(values, uniques, 0, na_sentinel, na_value)
+
+{{py:
+# tuples of "func_name, return_inverse, ignore_na"
+unique_funcs = [('_unique_no_inverse', False, False),
+                ('_unique_with_inverse', True, False),
+                ('_factorize', True, True),
+                ('get_labels', True, True)]
+}}
+
+{{for func_name, return_inverse, ignore_na in unique_funcs}}
+
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _unique(self, ndarray[object] values, ObjectVector uniques,
-                bint ignore_na, bint return_uniques, bint return_inverse,
-                Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                object na_value):
+    def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
+                      Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                      object na_value):
         """
-        Calculate unique values and labels (no sorting)
+        Calculate unique values and labels (no sorting!)
+{{if func_name == '_factorize' or func_name == 'get_labels'}}
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+{{endif}}
 
         Parameters
         ----------
@@ -694,17 +815,6 @@ cdef class StringHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        ignore_na : boolean
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_uniques : boolean
-            Whether to return the content of the passed "uniques" vector as an
-            np.ndarray at the end. If False, the vector passed to "uniques"
-            must be explicitly read and transformed by the user.
-        return_inverse : boolean
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t
@@ -717,10 +827,23 @@ cdef class StringHashTable(HashTable):
 
         Returns
         -------
-        uniques : ndarray[object] (if return_uniques)
+{{if func_name == '_unique_no_inverse'}}
+        uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
+{{elif func_name == '_unique_with_inverse'}}
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
+{{elif func_name == '_factorize'  # switched output order for factorize}}
+        labels : ndarray[int64]
             The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+{{elif func_name == 'get_labels'}}
+        labels : ndarray[int64]
+            The labels from values to uniques
+{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -733,7 +856,7 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        if return_inverse:
+        if {{return_inverse}}:
             labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
@@ -743,8 +866,9 @@ cdef class StringHashTable(HashTable):
         for i in range(n):
             val = values[i]
 
-            if ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val))
-                              or (use_na_value and val == na_value)):
+            if ({{ignore_na}}
+                and (not (PyUnicode_Check(val) or PyString_Check(val))
+                     or (use_na_value and val == na_value))):
                 # missing value
                 labels[i] = na_sentinel
             else:
@@ -755,7 +879,7 @@ cdef class StringHashTable(HashTable):
         # compute
         with nogil:
             for i in range(n):
-                if ignore_na and labels[i] == na_sentinel:
+                if {{ignore_na}} and labels[i] == na_sentinel:
                     continue
 
                 v = vecs[i]
@@ -764,11 +888,11 @@ cdef class StringHashTable(HashTable):
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
                     uindexer[count] = i
-                    if return_inverse:
+                    if {{return_inverse}}:
                         self.table.vals[k] = count
                         labels[i] = <int64_t>count
                     count += 1
-                elif return_inverse:
+                elif {{return_inverse}}:
                     # k falls into a previous bucket
                     # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
@@ -780,44 +904,17 @@ cdef class StringHashTable(HashTable):
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-        if return_uniques and return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        elif return_uniques:
-            return uniques.to_array()
-        elif return_inverse:
-            return np.asarray(labels)
-
-    def unique(self, ndarray[object] values, bint return_inverse=False):
-        uniques = ObjectVector()
-        return self._unique(values, uniques,
-                            False,  # ignore_na
-                            True,   # return_uniques
-                            return_inverse,
-                            # the rest are of the parameters are not relevant,
-                            # but we don't use kwargs to avoid cython perf hit
-                            0,      # count_prior
-                            -1,     # na_sentinel
-                            None)   # na_value
+{{if func_name == '_unique_no_inverse'}}
+        return uniques.to_array()
+{{elif func_name == '_unique_with_inverse'}}
+        return uniques.to_array(), np.asarray(labels)
+{{elif func_name == '_factorize'}}
+        return np.asarray(labels), uniques.to_array()
+{{elif func_name == 'get_labels'}}
+        return np.asarray(labels)
+{{endif}}
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
-                  object na_value):
-        uniques = ObjectVector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques,
-                            True,  # ignore_na
-                            True,  # return_uniques
-                            True,  # return_inverse
-                            0,     # count_prior
-                            na_sentinel, na_value)[::-1]
-
-    def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   object na_value):
-        return self._unique(values, uniques,
-                            True,   # ignore_na
-                            False,  # return_uniques
-                            True,   # return_inverse
-                            count_prior, na_sentinel, na_value)
+{{endfor}}
 
 
 cdef class PyObjectHashTable(HashTable):
@@ -905,14 +1002,87 @@ cdef class PyObjectHashTable(HashTable):
 
         return np.asarray(locs)
 
+    def unique(self, ndarray[object] values, bint return_inverse):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        # explicitly compile path without inverse for performance
+        # the last three arguments are not relevant for this method, but we
+        # don't use kwargs to avoid cython perf hit (just using default values)
+        if return_inverse:
+            return self._unique_with_inverse(values, uniques, 0, -1, None)
+        return self._unique_no_inverse(values, uniques, 0, -1, None)
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
+                  object na_value):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then None _plus_
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
+        # reduced signature compared to _factorize
+        # not necessary to have uniques-vector, count_prior
+        uniques = ObjectVector()
+        return self._factorize(values, uniques, 0, na_sentinel, na_value)
+
+{{py:
+# tuples of "func_name, return_inverse, ignore_na"
+unique_funcs = [('_unique_no_inverse', False, False),
+                ('_unique_with_inverse', True, False),
+                ('_factorize', True, True),
+                ('get_labels', True, True)]
+}}
+
+{{for func_name, return_inverse, ignore_na in unique_funcs}}
+
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def _unique(self, ndarray[object] values, ObjectVector uniques,
-                bint ignore_na, bint return_uniques, bint return_inverse,
-                Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                object na_value):
+    def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
+                      Py_ssize_t count_prior, Py_ssize_t na_sentinel,
+                      object na_value):
         """
-        Calculate unique values and labels (no sorting)
+        Calculate unique values and labels (no sorting!)
+{{if func_name == '_factorize' or func_name == 'get_labels'}}
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+{{endif}}
 
         Parameters
         ----------
@@ -920,17 +1090,6 @@ cdef class PyObjectHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        ignore_na : boolean
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_uniques : boolean
-            Whether to return the content of the passed "uniques" vector as an
-            np.ndarray at the end. If False, the vector passed to "uniques"
-            must be explicitly read and transformed by the user.
-        return_inverse : boolean
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t
@@ -943,10 +1102,23 @@ cdef class PyObjectHashTable(HashTable):
 
         Returns
         -------
-        uniques : ndarray[object] (if return_uniques)
+{{if func_name == '_unique_no_inverse'}}
+        uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
+{{elif func_name == '_unique_with_inverse'}}
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64]
             The labels from values to uniques
+{{elif func_name == '_factorize'  # switched output order for factorize}}
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+{{elif func_name == 'get_labels'}}
+        labels : ndarray[int64]
+            The labels from values to uniques
+{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -956,7 +1128,7 @@ cdef class PyObjectHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        if return_inverse:
+        if {{return_inverse}}:
             labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
@@ -964,8 +1136,8 @@ cdef class PyObjectHashTable(HashTable):
             val = values[i]
             hash(val)
 
-            if ignore_na and ((val != val or val is None)
-                              or (use_na_value and val == na_value)):
+            if {{ignore_na}} and ((val != val or val is None)
+                                  or (use_na_value and val == na_value)):
                 labels[i] = na_sentinel
                 continue
 
@@ -974,51 +1146,24 @@ cdef class PyObjectHashTable(HashTable):
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                 uniques.append(val)
-                if return_inverse:
+                if {{return_inverse}}:
                     self.table.vals[k] = count
                     labels[i] = count
                     count += 1
-            elif return_inverse:
+            elif {{return_inverse}}:
                 # k falls into a previous bucket
                 # only relevant in case we need to construct the inverse
                 idx = self.table.vals[k]
                 labels[i] = idx
 
-        if return_uniques and return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        elif return_uniques:
-            return uniques.to_array()
-        elif return_inverse:
-            return np.asarray(labels)
-
-    def unique(self, ndarray[object] values, bint return_inverse=False):
-        uniques = ObjectVector()
-        return self._unique(values, uniques,
-                            False,  # ignore_na
-                            True,   # return_uniques
-                            return_inverse,
-                            # the rest are of the parameters are not relevant,
-                            # but we don't use kwargs to avoid cython perf hit
-                            0,      # count_prior
-                            -1,     # na_sentinel
-                            None)   # na_value
+{{if func_name == '_unique_no_inverse'}}
+        return uniques.to_array()
+{{elif func_name == '_unique_with_inverse'}}
+        return uniques.to_array(), np.asarray(labels)
+{{elif func_name == '_factorize'}}
+        return np.asarray(labels), uniques.to_array()
+{{elif func_name == 'get_labels'}}
+        return np.asarray(labels)
+{{endif}}
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
-                  object na_value):
-        uniques = ObjectVector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques,
-                            True,  # ignore_na
-                            True,  # return_uniques
-                            True,  # return_inverse
-                            0,     # count_prior
-                            na_sentinel, na_value)[::-1]
-
-    def get_labels(self, ndarray[object] values, ObjectVector uniques,
-                   Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   object na_value):
-        return self._unique(values, uniques,
-                            True,   # ignore_na
-                            False,  # return_uniques
-                            True,   # return_inverse
-                            count_prior, na_sentinel, na_value)
+{{endfor}}

From 906cd50e8391148a19fb93487b7fedb25ab9e767 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Fri, 26 Oct 2018 19:18:55 +0200
Subject: [PATCH 03/17] Re-add kwargs to method signature

---
 pandas/_libs/hashtable.pyx                 |  3 +-
 pandas/_libs/hashtable_class_helper.pxi.in | 70 ++++++++++------------
 pandas/core/algorithms.py                  |  5 +-
 pandas/tests/test_algos.py                 | 17 ++----
 4 files changed, 43 insertions(+), 52 deletions(-)

diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 1d5f637a05188..2ced98198afc6 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -124,7 +124,8 @@ cdef class Int64Factorizer:
             uniques.extend(self.uniques.to_array())
             self.uniques = uniques
         labels = self.table.get_labels(values, self.uniques,
-                                       self.count, na_sentinel, na_value)
+                                       self.count, na_sentinel,
+                                       na_value=na_value)
 
         # sort on
         if sort:
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 6b50c65e29a4b..260c9bab91e93 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -355,7 +355,7 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(locs)
 
-    def unique(self, const {{dtype}}_t[:] values, bint return_inverse):
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -363,7 +363,7 @@ cdef class {{name}}HashTable(HashTable):
         ----------
         values : ndarray[{{dtype}}]
             Array of values of which unique will be calculated
-        return_inverse : boolean
+        return_inverse : boolean, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
 
@@ -376,14 +376,12 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques = {{name}}Vector()
         # explicitly compile path without inverse for performance
-        # the last three arguments are not relevant for this method, but we
-        # don't use kwargs to avoid cython perf hit (just using default values)
         if return_inverse:
-            return self._unique_with_inverse(values, uniques, 0, -1, None)
-        return self._unique_no_inverse(values, uniques, 0, -1, None)
+            return self._unique_with_inverse(values, uniques)
+        return self._unique_no_inverse(values, uniques)
 
-    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel,
-                  object na_value):
+    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -427,8 +425,8 @@ unique_funcs = [('_unique_no_inverse', False, False),
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def {{func_name}}(self, const {{dtype}}_t[:] values,
-                      {{name}}Vector uniques, Py_ssize_t count_prior,
-                      Py_ssize_t na_sentinel, object na_value):
+                      {{name}}Vector uniques, Py_ssize_t count_prior=0,
+                      Py_ssize_t na_sentinel=-1, object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 {{if func_name == '_factorize' or func_name == 'get_labels'}}
@@ -443,11 +441,11 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : {{name}}Vector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t
+        count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t
+        na_sentinel : Py_ssize_t, default -1
             Sentinel value used for all NA-values in inverse
-        na_value : object
+        na_value : object, default None
             Value to identify as missing. If na_value is None, then
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
@@ -727,7 +725,7 @@ cdef class StringHashTable(HashTable):
                 self.table.vals[k] = i
         free(vecs)
 
-    def unique(self, ndarray[object] values, bint return_inverse):
+    def unique(self, ndarray[object] values, bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -735,7 +733,7 @@ cdef class StringHashTable(HashTable):
         ----------
         values : ndarray[object]
             Array of values of which unique will be calculated
-        return_inverse : boolean
+        return_inverse : boolean, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
 
@@ -748,14 +746,12 @@ cdef class StringHashTable(HashTable):
         """
         uniques = ObjectVector()
         # explicitly compile path without inverse for performance
-        # the last three arguments are not relevant for this method, but we
-        # don't use kwargs to avoid cython perf hit (just using default values)
         if return_inverse:
-            return self._unique_with_inverse(values, uniques, 0, -1, None)
-        return self._unique_no_inverse(values, uniques, 0, -1, None)
+            return self._unique_with_inverse(values, uniques)
+        return self._unique_no_inverse(values, uniques)
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
-                  object na_value):
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -799,8 +795,8 @@ unique_funcs = [('_unique_no_inverse', False, False),
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
-                      Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                      object na_value):
+                      Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                      object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 {{if func_name == '_factorize' or func_name == 'get_labels'}}
@@ -815,11 +811,11 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t
+        count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t
+        na_sentinel : Py_ssize_t, default -1
             Sentinel value used for all NA-values in inverse
-        na_value : object
+        na_value : object, default None
             Value to identify as missing. If na_value is None, then any value
             that is not a string is considered missing. If na_value is
             not None, then _additionally_ any value "val" satisfying
@@ -1002,7 +998,7 @@ cdef class PyObjectHashTable(HashTable):
 
         return np.asarray(locs)
 
-    def unique(self, ndarray[object] values, bint return_inverse):
+    def unique(self, ndarray[object] values, bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1010,7 +1006,7 @@ cdef class PyObjectHashTable(HashTable):
         ----------
         values : ndarray[object]
             Array of values of which unique will be calculated
-        return_inverse : boolean
+        return_inverse : boolean, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
 
@@ -1023,14 +1019,12 @@ cdef class PyObjectHashTable(HashTable):
         """
         uniques = ObjectVector()
         # explicitly compile path without inverse for performance
-        # the last three arguments are not relevant for this method, but we
-        # don't use kwargs to avoid cython perf hit (just using default values)
         if return_inverse:
-            return self._unique_with_inverse(values, uniques, 0, -1, None)
-        return self._unique_no_inverse(values, uniques, 0, -1, None)
+            return self._unique_with_inverse(values, uniques)
+        return self._unique_no_inverse(values, uniques)
 
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel,
-                  object na_value):
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1074,8 +1068,8 @@ unique_funcs = [('_unique_no_inverse', False, False),
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
-                      Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                      object na_value):
+                      Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                      object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
 {{if func_name == '_factorize' or func_name == 'get_labels'}}
@@ -1090,9 +1084,9 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        count_prior : Py_ssize_t
+        count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
-        na_sentinel : Py_ssize_t
+        na_sentinel : Py_ssize_t, default -1
             Sentinel value used for all NA-values in inverse
         na_value : object
             Value to identify as missing. If na_value is None, then None _plus_
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 42cff6c431e37..df2da26685a16 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -365,7 +365,7 @@ def unique(values):
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values, False)
+    uniques = table.unique(values)
     uniques = _reconstruct_data(uniques, dtype, original)
 
     if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
@@ -470,7 +470,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     (hash_klass, _), values = _get_data_algo(values, _hashtables)
 
     table = hash_klass(size_hint or len(values))
-    labels, uniques = table.factorize(values, na_sentinel, na_value)
+    labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
+                                      na_value=na_value)
 
     labels = ensure_platform_int(labels)
     return labels, uniques
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cf0273826aca3..cd4abc7a67526 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1318,7 +1318,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype,
         uniques = uniques()
 
         # get_labels may append to uniques
-        htable.get_labels(vals[:nvals], uniques, 0, -1, None)
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
         # to_array() sets an external_view_exists flag on uniques.
         tmp = uniques.to_array()
         oldshape = tmp.shape
@@ -1326,10 +1326,10 @@ def test_vector_resize(self, writable, htable, uniques, dtype,
         # subsequent get_labels() calls can no longer append to it
         # (except for StringHashTables + ObjectVector)
         if safely_resizes:
-            htable.get_labels(vals, uniques, 0, -1, None)
+            htable.get_labels(vals, uniques, 0, -1)
         else:
             with tm.assert_raises_regex(ValueError, 'external reference.*'):
-                htable.get_labels(vals, uniques, 0, -1, None)
+                htable.get_labels(vals, uniques, 0, -1)
 
         uniques.to_array()   # should not raise here
         assert tmp.shape == oldshape
@@ -1358,14 +1358,12 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.unique()
         expected_unique = s_duplicated.drop_duplicates(keep='first').values
-        return_inverse = False
-        result_unique = htable().unique(s_duplicated.values, return_inverse)
+        result_unique = htable().unique(s_duplicated.values)
         tm.assert_numpy_array_equal(result_unique, expected_unique)
 
         # test with inverse
-        return_inverse = True
         result_unique, result_inverse = htable().unique(s_duplicated.values,
-                                                        return_inverse)
+                                                        return_inverse=True)
         tm.assert_numpy_array_equal(result_unique, expected_unique)
         reconstr = result_unique[result_inverse]
         tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
@@ -1392,10 +1390,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
         s_duplicated.values.setflags(write=writable)
         na_mask = s_duplicated.isna().values
 
-        na_sentinel = -1
-        na_value = None
-        result = htable().factorize(s_duplicated.values, na_sentinel, na_value)
-        result_inverse, result_unique = result
+        result_inverse, result_unique = htable().factorize(s_duplicated.values)
 
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.factorize()

From 19c7c1f8d240c508b212db6b8d940620164d1566 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 28 Oct 2018 19:11:27 +0100
Subject: [PATCH 04/17] Fix oversight: add missing "default None" to na_value docstring

---
 pandas/_libs/hashtable_class_helper.pxi.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 260c9bab91e93..f59a287a9dc3b 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -1088,7 +1088,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
             Sentinel value used for all NA-values in inverse
-        na_value : object
+        na_value : object, default None
             Value to identify as missing. If na_value is None, then None _plus_
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"

From a8f079f32a2d914bd432520e49c27f740f0bd14a Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 29 Oct 2018 07:47:01 +0100
Subject: [PATCH 05/17] Simplify docstring template condition to {{if ignore_na}}

---
 pandas/_libs/hashtable_class_helper.pxi.in | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index f59a287a9dc3b..789a3820ddb09 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -429,7 +429,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
                       Py_ssize_t na_sentinel=-1, object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if func_name == '_factorize' or func_name == 'get_labels'}}
+{{if ignore_na}}
 
         Missing values are not included in the "uniques" for this method.
         The labels for any missing values will be set to "na_sentinel"
@@ -799,7 +799,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
                       object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if func_name == '_factorize' or func_name == 'get_labels'}}
+{{if ignore_na}}
 
         Missing values are not included in the "uniques" for this method.
         The labels for any missing values will be set to "na_sentinel"
@@ -1072,7 +1072,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
                       object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if func_name == '_factorize' or func_name == 'get_labels'}}
+{{if ignore_na}}
 
         Missing values are not included in the "uniques" for this method.
         The labels for any missing values will be set to "na_sentinel"

From 1c5b97ae73e6b4255d9bae49c2f713ea22564e72 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 29 Oct 2018 07:50:36 +0100
Subject: [PATCH 06/17] Reword return_inverse comment in test_hashtable_unique

---
 pandas/tests/test_algos.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index cd4abc7a67526..27c51983adef9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1361,7 +1361,8 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
         result_unique = htable().unique(s_duplicated.values)
         tm.assert_numpy_array_equal(result_unique, expected_unique)
 
-        # test with inverse
+        # test return_inverse=True
+        # reconstruction can only succeed if the inverse is correct
         result_unique, result_inverse = htable().unique(s_duplicated.values,
                                                         return_inverse=True)
         tm.assert_numpy_array_equal(result_unique, expected_unique)

From c7327fd0d80240cef76cfe019266c00a105befdc Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Wed, 31 Oct 2018 19:19:45 +0100
Subject: [PATCH 07/17] Revert templating over {return_inverse, ignore_na}

---
 pandas/_libs/hashtable_class_helper.pxi.in | 555 +++++++++------------
 1 file changed, 246 insertions(+), 309 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 789a3820ddb09..0d9b56880eb34 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -355,85 +355,14 @@ cdef class {{name}}HashTable(HashTable):
 
         return np.asarray(locs)
 
-    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Parameters
-        ----------
-        values : ndarray[{{dtype}}]
-            Array of values of which unique will be calculated
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
-
-        Returns
-        -------
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
-            The labels from values to uniques
-        """
-        uniques = {{name}}Vector()
-        # explicitly compile path without inverse for performance
-        if return_inverse:
-            return self._unique_with_inverse(values, uniques)
-        return self._unique_no_inverse(values, uniques)
-
-    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-
-        Parameters
-        ----------
-        values : ndarray[{{dtype}}]
-            Array of values of which unique will be calculated
-        na_sentinel : Py_ssize_t, default -1
-            Sentinel value used for all NA-values in inverse
-        na_value : object, default None
-            Value to identify as missing. If na_value is None, then
-            any value "val" satisfying val != val is considered missing.
-            If na_value is not None, then _additionally_, any value "val"
-            satisfying val == na_value is considered missing.
-
-        Returns
-        -------
-        labels : ndarray[int64]
-            The labels from values to uniques
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-        """
-        # reduced signature compared to _factorize
-        # not necessary to have uniques-vector, count_prior
-        uniques = {{name}}Vector()
-        return self._factorize(values, uniques, 0, na_sentinel, na_value)
-
-{{py:
-# tuples of "func_name, return_inverse, ignore_na"
-unique_funcs = [('_unique_no_inverse', False, False),
-                ('_unique_with_inverse', True, False),
-                ('_factorize', True, True),
-                ('get_labels', True, True)]
-}}
-
-{{for func_name, return_inverse, ignore_na in unique_funcs}}
-
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def {{func_name}}(self, const {{dtype}}_t[:] values,
-                      {{name}}Vector uniques, Py_ssize_t count_prior=0,
-                      Py_ssize_t na_sentinel=-1, object na_value=None):
+    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                bint ignore_na=False, bint return_inverse=False,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if ignore_na}}
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-{{endif}}
 
         Parameters
         ----------
@@ -441,6 +370,13 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : {{name}}Vector
             Vector into which uniques will be written
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -453,23 +389,10 @@ unique_funcs = [('_unique_no_inverse', False, False),
 
         Returns
         -------
-{{if func_name == '_unique_no_inverse'}}
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-{{elif func_name == '_unique_with_inverse'}}
-        uniques : ndarray[{{dtype}}]
-            Unique values of input, not sorted
-        labels : ndarray[int64]
-            The labels from values to uniques
-{{elif func_name == '_factorize'  # switched output order for factorize}}
-        labels : ndarray[int64]
-            The labels from values to uniques
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-{{elif func_name == 'get_labels'}}
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
-{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -480,7 +403,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
             {{name}}VectorData *ud
             bint use_na_value
 
-        if {{return_inverse}}:
+        if return_inverse:
             labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
@@ -499,8 +422,8 @@ unique_funcs = [('_unique_no_inverse', False, False),
             for i in range(n):
                 val = values[i]
 
-                if {{ignore_na}} and (val != val
-                                      or (use_na_value and val == na_value2)):
+                if ignore_na and (val != val
+                                  or (use_na_value and val == na_value2)):
                     labels[i] = na_sentinel
                     continue
 
@@ -518,27 +441,83 @@ unique_funcs = [('_unique_no_inverse', False, False),
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    if {{return_inverse}}:
+                    if return_inverse:
                         self.table.vals[k] = count
                         labels[i] = count
                         count += 1
-                elif {{return_inverse}}:
+                elif return_inverse:
                     # k falls into a previous bucket
                     # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = idx
 
-{{if func_name == '_unique_no_inverse'}}
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
         return uniques.to_array()
-{{elif func_name == '_unique_with_inverse'}}
-        return uniques.to_array(), np.asarray(labels)
-{{elif func_name == '_factorize'}}
-        return np.asarray(labels), uniques.to_array()
-{{elif func_name == 'get_labels'}}
-        return np.asarray(labels)
-{{endif}}
 
-{{endfor}}
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = {{name}}Vector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
+
+    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[{{dtype}}]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[{{dtype}}]
+            Unique values of input, not sorted
+        """
+        uniques = {{name}}Vector()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques, ignore_na=True,
+                            return_inverse=True, na_sentinel=na_sentinel,
+                            na_value=na_value)[::-1]
+
+    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        _, labels = self._unique(values, uniques, ignore_na=True,
+                                 return_inverse=True, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels
 
     @cython.boundscheck(False)
     def get_labels_groupby(self, const {{dtype}}_t[:] values):
@@ -725,85 +704,14 @@ cdef class StringHashTable(HashTable):
                 self.table.vals[k] = i
         free(vecs)
 
-    def unique(self, ndarray[object] values, bint return_inverse=False):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
-            The labels from values to uniques
-        """
-        uniques = ObjectVector()
-        # explicitly compile path without inverse for performance
-        if return_inverse:
-            return self._unique_with_inverse(values, uniques)
-        return self._unique_no_inverse(values, uniques)
-
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-        na_sentinel : Py_ssize_t, default -1
-            Sentinel value used for all NA-values in inverse
-        na_value : object, default None
-            Value to identify as missing. If na_value is None, then any value
-            that is not a string is considered missing. If na_value is
-            not None, then _additionally_ any value "val" satisfying
-            val == na_value is considered missing.
-
-        Returns
-        -------
-        labels : ndarray[int64]
-            The labels from values to uniques
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        # reduced signature compared to _factorize
-        # not necessary to have uniques-vector, count_prior
-        uniques = ObjectVector()
-        return self._factorize(values, uniques, 0, na_sentinel, na_value)
-
-{{py:
-# tuples of "func_name, return_inverse, ignore_na"
-unique_funcs = [('_unique_no_inverse', False, False),
-                ('_unique_with_inverse', True, False),
-                ('_factorize', True, True),
-                ('get_labels', True, True)]
-}}
-
-{{for func_name, return_inverse, ignore_na in unique_funcs}}
-
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
-                      Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                      object na_value=None):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                bint ignore_na=False, bint return_inverse=False,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if ignore_na}}
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-{{endif}}
 
         Parameters
         ----------
@@ -811,6 +719,13 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -823,23 +738,10 @@ unique_funcs = [('_unique_no_inverse', False, False),
 
         Returns
         -------
-{{if func_name == '_unique_no_inverse'}}
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-{{elif func_name == '_unique_with_inverse'}}
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        labels : ndarray[int64]
-            The labels from values to uniques
-{{elif func_name == '_factorize'  # switched output order for factorize}}
-        labels : ndarray[int64]
-            The labels from values to uniques
         uniques : ndarray[object]
             Unique values of input, not sorted
-{{elif func_name == 'get_labels'}}
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
-{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -852,7 +754,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
             khiter_t k
             bint use_na_value
 
-        if {{return_inverse}}:
+        if return_inverse:
             labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
@@ -862,7 +764,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
         for i in range(n):
             val = values[i]
 
-            if ({{ignore_na}}
+            if (ignore_na
                 and (not (PyUnicode_Check(val) or PyString_Check(val))
                      or (use_na_value and val == na_value))):
                 # missing value
@@ -875,7 +777,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
         # compute
         with nogil:
             for i in range(n):
-                if {{ignore_na}} and labels[i] == na_sentinel:
+                if ignore_na and labels[i] == na_sentinel:
                     continue
 
                 v = vecs[i]
@@ -884,11 +786,11 @@ unique_funcs = [('_unique_no_inverse', False, False),
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
                     uindexer[count] = i
-                    if {{return_inverse}}:
+                    if return_inverse:
                         self.table.vals[k] = count
                         labels[i] = <int64_t>count
                     count += 1
-                elif {{return_inverse}}:
+                elif return_inverse:
                     # k falls into a previous bucket
                     # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
@@ -900,17 +802,73 @@ unique_funcs = [('_unique_no_inverse', False, False),
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-{{if func_name == '_unique_no_inverse'}}
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
         return uniques.to_array()
-{{elif func_name == '_unique_with_inverse'}}
-        return uniques.to_array(), np.asarray(labels)
-{{elif func_name == '_factorize'}}
-        return np.asarray(labels), uniques.to_array()
-{{elif func_name == 'get_labels'}}
-        return np.asarray(labels)
-{{endif}}
 
-{{endfor}}
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then any value
+            that is not a string is considered missing. If na_value is
+            not None, then _additionally_ any value "val" satisfying
+            val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
+        uniques = ObjectVector()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques, ignore_na=True,
+                            return_inverse=True, na_sentinel=na_sentinel,
+                            na_value=na_value)[::-1]
+
+    def get_labels(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        _, labels = self._unique(values, uniques, ignore_na=True,
+                                 return_inverse=True, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels
 
 
 cdef class PyObjectHashTable(HashTable):
@@ -998,85 +956,14 @@ cdef class PyObjectHashTable(HashTable):
 
         return np.asarray(locs)
 
-    def unique(self, ndarray[object] values, bint return_inverse=False):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
-
-        Returns
-        -------
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse)
-            The labels from values to uniques
-        """
-        uniques = ObjectVector()
-        # explicitly compile path without inverse for performance
-        if return_inverse:
-            return self._unique_with_inverse(values, uniques)
-        return self._unique_no_inverse(values, uniques)
-
-    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
-        """
-        Calculate unique values and labels (no sorting!)
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-
-        Parameters
-        ----------
-        values : ndarray[object]
-            Array of values of which unique will be calculated
-        na_sentinel : Py_ssize_t, default -1
-            Sentinel value used for all NA-values in inverse
-        na_value : object, default None
-            Value to identify as missing. If na_value is None, then None _plus_
-            any value "val" satisfying val != val is considered missing.
-            If na_value is not None, then _additionally_, any value "val"
-            satisfying val == na_value is considered missing.
-
-        Returns
-        -------
-        labels : ndarray[int64]
-            The labels from values to uniques
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        """
-        # reduced signature compared to _factorize
-        # not necessary to have uniques-vector, count_prior
-        uniques = ObjectVector()
-        return self._factorize(values, uniques, 0, na_sentinel, na_value)
-
-{{py:
-# tuples of "func_name, return_inverse, ignore_na"
-unique_funcs = [('_unique_no_inverse', False, False),
-                ('_unique_with_inverse', True, False),
-                ('_factorize', True, True),
-                ('get_labels', True, True)]
-}}
-
-{{for func_name, return_inverse, ignore_na in unique_funcs}}
-
     @cython.boundscheck(False)
     @cython.wraparound(False)
-    def {{func_name}}(self, ndarray[object] values, ObjectVector uniques,
-                      Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                      object na_value=None):
+    def _unique(self, ndarray[object] values, ObjectVector uniques,
+                bint ignore_na=False, bint return_inverse=False,
+                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                object na_value=None):
         """
         Calculate unique values and labels (no sorting!)
-{{if ignore_na}}
-
-        Missing values are not included in the "uniques" for this method.
-        The labels for any missing values will be set to "na_sentinel"
-{{endif}}
 
         Parameters
         ----------
@@ -1084,6 +971,13 @@ unique_funcs = [('_unique_no_inverse', False, False),
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -1096,23 +990,10 @@ unique_funcs = [('_unique_no_inverse', False, False),
 
         Returns
         -------
-{{if func_name == '_unique_no_inverse'}}
         uniques : ndarray[object]
             Unique values of input, not sorted
-{{elif func_name == '_unique_with_inverse'}}
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
-{{elif func_name == '_factorize'  # switched output order for factorize}}
-        labels : ndarray[int64]
-            The labels from values to uniques
-        uniques : ndarray[object]
-            Unique values of input, not sorted
-{{elif func_name == 'get_labels'}}
-        labels : ndarray[int64]
-            The labels from values to uniques
-{{endif}}
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -1122,7 +1003,7 @@ unique_funcs = [('_unique_no_inverse', False, False),
             khiter_t k
             bint use_na_value
 
-        if {{return_inverse}}:
+        if return_inverse:
             labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
@@ -1130,8 +1011,8 @@ unique_funcs = [('_unique_no_inverse', False, False),
             val = values[i]
             hash(val)
 
-            if {{ignore_na}} and ((val != val or val is None)
-                                  or (use_na_value and val == na_value)):
+            if ignore_na and ((val != val or val is None)
+                              or (use_na_value and val == na_value)):
                 labels[i] = na_sentinel
                 continue
 
@@ -1140,24 +1021,80 @@ unique_funcs = [('_unique_no_inverse', False, False),
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                 uniques.append(val)
-                if {{return_inverse}}:
+                if return_inverse:
                     self.table.vals[k] = count
                     labels[i] = count
                     count += 1
-            elif {{return_inverse}}:
+            elif return_inverse:
                 # k falls into a previous bucket
                 # only relevant in case we need to construct the inverse
                 idx = self.table.vals[k]
                 labels[i] = idx
 
-{{if func_name == '_unique_no_inverse'}}
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
         return uniques.to_array()
-{{elif func_name == '_unique_with_inverse'}}
-        return uniques.to_array(), np.asarray(labels)
-{{elif func_name == '_factorize'}}
-        return np.asarray(labels), uniques.to_array()
-{{elif func_name == 'get_labels'}}
-        return np.asarray(labels)
-{{endif}}
 
-{{endfor}}
+    def unique(self, ndarray[object] values, bint return_inverse=False):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
+
+        Returns
+        -------
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        labels : ndarray[int64] (if return_inverse)
+            The labels from values to uniques
+        """
+        uniques = ObjectVector()
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
+
+    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
+                  object na_value=None):
+        """
+        Calculate unique values and labels (no sorting!)
+
+        Missing values are not included in the "uniques" for this method.
+        The labels for any missing values will be set to "na_sentinel"
+
+        Parameters
+        ----------
+        values : ndarray[object]
+            Array of values of which unique will be calculated
+        na_sentinel : Py_ssize_t, default -1
+            Sentinel value used for all NA-values in inverse
+        na_value : object, default None
+            Value to identify as missing. If na_value is None, then None _plus_
+            any value "val" satisfying val != val is considered missing.
+            If na_value is not None, then _additionally_, any value "val"
+            satisfying val == na_value is considered missing.
+
+        Returns
+        -------
+        labels : ndarray[int64]
+            The labels from values to uniques
+        uniques : ndarray[object]
+            Unique values of input, not sorted
+        """
+        uniques = ObjectVector()
+        # factorize has reversed outputs compared to _unique (see "[::-1]")
+        return self._unique(values, uniques, ignore_na=True,
+                            return_inverse=True, na_sentinel=na_sentinel,
+                            na_value=na_value)[::-1]
+
+    def get_labels(self, ndarray[object] values, ObjectVector uniques,
+                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
+                   object na_value=None):
+        _, labels = self._unique(values, uniques, ignore_na=True,
+                                 return_inverse=True, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value)
+        return labels

From a06494e33237e9a478340f5b5bcb4d006ff4f7a0 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sat, 3 Nov 2018 16:50:52 +0100
Subject: [PATCH 08/17] Add new kwargs at the end (review jreback)

---
 pandas/_libs/hashtable_class_helper.pxi.in | 108 +++++++++++----------
 1 file changed, 57 insertions(+), 51 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 99fc3337c73f6..ed2e6b9e65d3d 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -357,9 +357,9 @@ cdef class {{name}}HashTable(HashTable):
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
-                bint ignore_na=False, bint return_inverse=False,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -369,13 +369,6 @@ cdef class {{name}}HashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : {{name}}Vector
             Vector into which uniques will be written
-        ignore_na : boolean, default False
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -385,6 +378,13 @@ cdef class {{name}}HashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
@@ -504,18 +504,20 @@ cdef class {{name}}HashTable(HashTable):
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
         """
-        uniques = {{name}}Vector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques, ignore_na=True,
-                            return_inverse=True, na_sentinel=na_sentinel,
-                            na_value=na_value)[::-1]
+        uniques_vector = {{name}}Vector()
+        uniques, labels = self._unique(values, uniques_vector,
+                                       na_sentinel=na_sentinel,
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
+        # factorize has reversed outputs compared to _unique
+        return labels, uniques
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        _, labels = self._unique(values, uniques, ignore_na=True,
-                                 return_inverse=True, count_prior=count_prior,
-                                 na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
         return labels
 
     @cython.boundscheck(False)
@@ -706,9 +708,9 @@ cdef class StringHashTable(HashTable):
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
-                bint ignore_na=False, bint return_inverse=False,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -718,13 +720,6 @@ cdef class StringHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        ignore_na : boolean, default False
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -734,6 +729,13 @@ cdef class StringHashTable(HashTable):
             that is not a string is considered missing. If na_value is
             not None, then _additionally_ any value "val" satisfying
             val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
@@ -855,18 +857,20 @@ cdef class StringHashTable(HashTable):
         uniques : ndarray[object]
             Unique values of input, not sorted
         """
-        uniques = ObjectVector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques, ignore_na=True,
-                            return_inverse=True, na_sentinel=na_sentinel,
-                            na_value=na_value)[::-1]
+        uniques_vector = ObjectVector()
+        uniques, labels = self._unique(values, uniques_vector,
+                                       na_sentinel=na_sentinel,
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
+        # factorize has reversed outputs compared to _unique
+        return labels, uniques
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        _, labels = self._unique(values, uniques, ignore_na=True,
-                                 return_inverse=True, count_prior=count_prior,
-                                 na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
         return labels
 
 
@@ -958,9 +962,9 @@ cdef class PyObjectHashTable(HashTable):
     @cython.boundscheck(False)
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
-                bint ignore_na=False, bint return_inverse=False,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -970,13 +974,6 @@ cdef class PyObjectHashTable(HashTable):
             Array of values of which unique will be calculated
         uniques : ObjectVector
             Vector into which uniques will be written
-        ignore_na : boolean, default False
-            Whether NA-values should be ignored for calculating the uniques. If
-            True, the labels corresponding to missing values will be set to
-            na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
         count_prior : Py_ssize_t, default 0
             Number of existing entries in uniques
         na_sentinel : Py_ssize_t, default -1
@@ -986,6 +983,13 @@ cdef class PyObjectHashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        ignore_na : boolean, default False
+            Whether NA-values should be ignored for calculating the uniques. If
+            True, the labels corresponding to missing values will be set to
+            na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
@@ -1084,16 +1088,18 @@ cdef class PyObjectHashTable(HashTable):
         uniques : ndarray[object]
             Unique values of input, not sorted
         """
-        uniques = ObjectVector()
-        # factorize has reversed outputs compared to _unique (see "[::-1]")
-        return self._unique(values, uniques, ignore_na=True,
-                            return_inverse=True, na_sentinel=na_sentinel,
-                            na_value=na_value)[::-1]
+        uniques_vector = ObjectVector()
+        uniques, labels = self._unique(values, uniques_vector,
+                                       na_sentinel=na_sentinel,
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
+        # factorize has reversed outputs compared to _unique
+        return labels, uniques
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                    object na_value=None):
-        _, labels = self._unique(values, uniques, ignore_na=True,
-                                 return_inverse=True, count_prior=count_prior,
-                                 na_sentinel=na_sentinel, na_value=na_value)
+        _, labels = self._unique(values, uniques, count_prior=count_prior,
+                                 na_sentinel=na_sentinel, na_value=na_value,
+                                 ignore_na=True, return_inverse=True)
         return labels

From 906a2b9fa52c808a2f56661afa2ceb8d9915c7f8 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 11 Nov 2018 11:14:32 +0100
Subject: [PATCH 09/17] Retrigger CircleCI


From 29aecdd1d223b63df1505d946cd900c333be5265 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 11 Nov 2018 18:16:46 +0100
Subject: [PATCH 10/17] Retrigger CI after flaky hypothesis test


From 746c0e393876ca974d96067a626125622b72070b Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 11 Nov 2018 19:14:36 +0100
Subject: [PATCH 11/17] Retrigger CircleCI


From 8da33f4ab68ee1b9fd10806172b9ffc6c8392b0e Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 11 Nov 2018 22:36:23 +0100
Subject: [PATCH 12/17] Retrigger CI after timeout


From ba9d8b805d57080969e71567b8df89e35e87afb2 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 11 Nov 2018 22:38:13 +0100
Subject: [PATCH 13/17] Retrigger CircleCI


From 30de418936cda9aa9c26a8cfa4c3a0b42906e2b2 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Sun, 18 Nov 2018 14:49:23 +0100
Subject: [PATCH 14/17] Always calculate inverse

---
 pandas/_libs/hashtable_class_helper.pxi.in | 106 ++++++++-------------
 1 file changed, 41 insertions(+), 65 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c26e1e5d102d7..cf85a9f20e2c5 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -358,8 +358,7 @@ cdef class {{name}}HashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False,
-                bint return_inverse=False):
+                object na_value=None, bint ignore_na=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -382,15 +381,12 @@ cdef class {{name}}HashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[int64]
             The labels from values to uniques
         """
         cdef:
@@ -402,8 +398,7 @@ cdef class {{name}}HashTable(HashTable):
             {{name}}VectorData *ud
             bint use_na_value
 
-        if return_inverse:
-            labels = np.empty(n, dtype=np.int64)
+        labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
 
@@ -440,19 +435,15 @@ cdef class {{name}}HashTable(HashTable):
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    if return_inverse:
-                        self.table.vals[k] = count
-                        labels[i] = count
-                        count += 1
-                elif return_inverse:
+                    self.table.vals[k] = count
+                    labels[i] = count
+                    count += 1
+                else:
                     # k falls into a previous bucket
-                    # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = idx
 
-        if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        return uniques.to_array()
+        return uniques.to_array(), np.asarray(labels)
 
     def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
         """
@@ -474,8 +465,10 @@ cdef class {{name}}HashTable(HashTable):
             The labels from values to uniques
         """
         uniques = {{name}}Vector()
-        return self._unique(values, uniques, ignore_na=False,
-                            return_inverse=return_inverse)
+        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
+        if return_inverse:
+            return uniques, inverse
+        return uniques
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -507,8 +500,7 @@ cdef class {{name}}HashTable(HashTable):
         uniques_vector = {{name}}Vector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
+                                       na_value=na_value, ignore_na=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -517,7 +509,7 @@ cdef class {{name}}HashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True, return_inverse=True)
+                                 ignore_na=True)
         return labels
 
     @cython.boundscheck(False)
@@ -709,8 +701,7 @@ cdef class StringHashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False,
-                bint return_inverse=False):
+                object na_value=None, bint ignore_na=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -733,15 +724,12 @@ cdef class StringHashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[int64]
             The labels from values to uniques
         """
         cdef:
@@ -755,8 +743,7 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        if return_inverse:
-            labels = np.zeros(n, dtype=np.int64)
+        labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
@@ -787,13 +774,11 @@ cdef class StringHashTable(HashTable):
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
                     uindexer[count] = i
-                    if return_inverse:
-                        self.table.vals[k] = count
-                        labels[i] = <int64_t>count
+                    self.table.vals[k] = count
+                    labels[i] = <int64_t>count
                     count += 1
-                elif return_inverse:
+                else:
                     # k falls into a previous bucket
-                    # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = <int64_t>idx
 
@@ -803,9 +788,7 @@ cdef class StringHashTable(HashTable):
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-        if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        return uniques.to_array()
+        return uniques.to_array(), np.asarray(labels)
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
         """
@@ -827,8 +810,10 @@ cdef class StringHashTable(HashTable):
             The labels from values to uniques
         """
         uniques = ObjectVector()
-        return self._unique(values, uniques, ignore_na=False,
-                            return_inverse=return_inverse)
+        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
+        if return_inverse:
+            return uniques, inverse
+        return uniques
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -860,8 +845,7 @@ cdef class StringHashTable(HashTable):
         uniques_vector = ObjectVector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
+                                       na_value=na_value, ignore_na=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -870,7 +854,7 @@ cdef class StringHashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True, return_inverse=True)
+                                 ignore_na=True)
         return labels
 
 
@@ -963,8 +947,7 @@ cdef class PyObjectHashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False,
-                bint return_inverse=False):
+                object na_value=None, bint ignore_na=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -987,15 +970,12 @@ cdef class PyObjectHashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
-        return_inverse : boolean, default False
-            Whether the mapping of the original array values to their location
-            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64] (if return_inverse=True)
+        labels : ndarray[int64]
             The labels from values to uniques
         """
         cdef:
@@ -1006,8 +986,7 @@ cdef class PyObjectHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        if return_inverse:
-            labels = np.empty(n, dtype=np.int64)
+        labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
         for i in range(n):
@@ -1024,19 +1003,15 @@ cdef class PyObjectHashTable(HashTable):
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                 uniques.append(val)
-                if return_inverse:
-                    self.table.vals[k] = count
-                    labels[i] = count
-                    count += 1
-            elif return_inverse:
+                self.table.vals[k] = count
+                labels[i] = count
+                count += 1
+            else:
                 # k falls into a previous bucket
-                # only relevant in case we need to construct the inverse
                 idx = self.table.vals[k]
                 labels[i] = idx
 
-        if return_inverse:
-            return uniques.to_array(), np.asarray(labels)
-        return uniques.to_array()
+        return uniques.to_array(), np.asarray(labels)
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
         """
@@ -1058,8 +1033,10 @@ cdef class PyObjectHashTable(HashTable):
             The labels from values to uniques
         """
         uniques = ObjectVector()
-        return self._unique(values, uniques, ignore_na=False,
-                            return_inverse=return_inverse)
+        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
+        if return_inverse:
+            return uniques, inverse
+        return uniques
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -1091,8 +1068,7 @@ cdef class PyObjectHashTable(HashTable):
         uniques_vector = ObjectVector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
+                                       na_value=na_value, ignore_na=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -1101,5 +1077,5 @@ cdef class PyObjectHashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True, return_inverse=True)
+                                 ignore_na=True)
         return labels

From 0b85759f9c1ec9d2a5f7f242972d34e2d9704476 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Mon, 19 Nov 2018 23:36:48 +0100
Subject: [PATCH 15/17] Revert "Always calculate inverse"

This reverts commit 30de418936cda9aa9c26a8cfa4c3a0b42906e2b2.
---
 pandas/_libs/hashtable_class_helper.pxi.in | 106 +++++++++++++--------
 1 file changed, 65 insertions(+), 41 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index cf85a9f20e2c5..c26e1e5d102d7 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -358,7 +358,8 @@ cdef class {{name}}HashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -381,12 +382,15 @@ cdef class {{name}}HashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -398,7 +402,8 @@ cdef class {{name}}HashTable(HashTable):
             {{name}}VectorData *ud
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
 
@@ -435,15 +440,19 @@ cdef class {{name}}HashTable(HashTable):
                                                  "Vector.resize() needed")
                             uniques.resize()
                     append_data_{{dtype}}(ud, val)
-                    self.table.vals[k] = count
-                    labels[i] = count
-                    count += 1
-                else:
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = count
+                        count += 1
+                elif return_inverse:
                     # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = idx
 
-        return uniques.to_array(), np.asarray(labels)
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
 
     def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
         """
@@ -465,10 +474,8 @@ cdef class {{name}}HashTable(HashTable):
             The labels from values to uniques
         """
         uniques = {{name}}Vector()
-        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
-        if return_inverse:
-            return uniques, inverse
-        return uniques
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -500,7 +507,8 @@ cdef class {{name}}HashTable(HashTable):
         uniques_vector = {{name}}Vector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True)
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -509,7 +517,7 @@ cdef class {{name}}HashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True)
+                                 ignore_na=True, return_inverse=True)
         return labels
 
     @cython.boundscheck(False)
@@ -701,7 +709,8 @@ cdef class StringHashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -724,12 +733,15 @@ cdef class StringHashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -743,7 +755,8 @@ cdef class StringHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.zeros(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.zeros(n, dtype=np.int64)
         uindexer = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
@@ -774,11 +787,13 @@ cdef class StringHashTable(HashTable):
                     # k hasn't been seen yet
                     k = kh_put_str(self.table, v, &ret)
                     uindexer[count] = i
-                    self.table.vals[k] = count
-                    labels[i] = <int64_t>count
+                    if return_inverse:
+                        self.table.vals[k] = count
+                        labels[i] = <int64_t>count
                     count += 1
-                else:
+                elif return_inverse:
                     # k falls into a previous bucket
+                    # only relevant in case we need to construct the inverse
                     idx = self.table.vals[k]
                     labels[i] = <int64_t>idx
 
@@ -788,7 +803,9 @@ cdef class StringHashTable(HashTable):
         for i in range(count):
             uniques.append(values[uindexer[i]])
 
-        return uniques.to_array(), np.asarray(labels)
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
         """
@@ -810,10 +827,8 @@ cdef class StringHashTable(HashTable):
             The labels from values to uniques
         """
         uniques = ObjectVector()
-        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
-        if return_inverse:
-            return uniques, inverse
-        return uniques
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -845,7 +860,8 @@ cdef class StringHashTable(HashTable):
         uniques_vector = ObjectVector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True)
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -854,7 +870,7 @@ cdef class StringHashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True)
+                                 ignore_na=True, return_inverse=True)
         return labels
 
 
@@ -947,7 +963,8 @@ cdef class PyObjectHashTable(HashTable):
     @cython.wraparound(False)
     def _unique(self, ndarray[object] values, ObjectVector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                object na_value=None, bint ignore_na=False):
+                object na_value=None, bint ignore_na=False,
+                bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -970,12 +987,15 @@ cdef class PyObjectHashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
+        return_inverse : boolean, default False
+            Whether the mapping of the original array values to their location
+            in the vector of uniques should be returned.
 
         Returns
         -------
         uniques : ndarray[object]
             Unique values of input, not sorted
-        labels : ndarray[int64]
+        labels : ndarray[int64] (if return_inverse=True)
             The labels from values to uniques
         """
         cdef:
@@ -986,7 +1006,8 @@ cdef class PyObjectHashTable(HashTable):
             khiter_t k
             bint use_na_value
 
-        labels = np.empty(n, dtype=np.int64)
+        if return_inverse:
+            labels = np.empty(n, dtype=np.int64)
         use_na_value = na_value is not None
 
         for i in range(n):
@@ -1003,15 +1024,19 @@ cdef class PyObjectHashTable(HashTable):
                 # k hasn't been seen yet
                 k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                 uniques.append(val)
-                self.table.vals[k] = count
-                labels[i] = count
-                count += 1
-            else:
+                if return_inverse:
+                    self.table.vals[k] = count
+                    labels[i] = count
+                    count += 1
+            elif return_inverse:
                 # k falls into a previous bucket
+                # only relevant in case we need to construct the inverse
                 idx = self.table.vals[k]
                 labels[i] = idx
 
-        return uniques.to_array(), np.asarray(labels)
+        if return_inverse:
+            return uniques.to_array(), np.asarray(labels)
+        return uniques.to_array()
 
     def unique(self, ndarray[object] values, bint return_inverse=False):
         """
@@ -1033,10 +1058,8 @@ cdef class PyObjectHashTable(HashTable):
             The labels from values to uniques
         """
         uniques = ObjectVector()
-        uniques, inverse =  self._unique(values, uniques, ignore_na=False)
-        if return_inverse:
-            return uniques, inverse
-        return uniques
+        return self._unique(values, uniques, ignore_na=False,
+                            return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
@@ -1068,7 +1091,8 @@ cdef class PyObjectHashTable(HashTable):
         uniques_vector = ObjectVector()
         uniques, labels = self._unique(values, uniques_vector,
                                        na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True)
+                                       na_value=na_value, ignore_na=True,
+                                       return_inverse=True)
         # factorize has reversed outputs compared to _unique
         return labels, uniques
 
@@ -1077,5 +1101,5 @@ cdef class PyObjectHashTable(HashTable):
                    object na_value=None):
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
-                                 ignore_na=True)
+                                 ignore_na=True, return_inverse=True)
         return labels

From 44518545259e6415f198501b3450bc189c8a6aec Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Wed, 21 Nov 2018 22:43:02 +0100
Subject: [PATCH 16/17] Add comments to ignore_na branches (review jreback)

---
 pandas/_libs/hashtable_class_helper.pxi.in | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c26e1e5d102d7..a0aef8808f664 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -423,6 +423,9 @@ cdef class {{name}}HashTable(HashTable):
 
                 if ignore_na and (val != val
                                   or (use_na_value and val == na_value2)):
+                    # if missing values do not count as unique values (i.e. if
+                    # ignore_na is True), skip the hashtable entry for them,
+                    # and replace the corresponding label with na_sentinel
                     labels[i] = na_sentinel
                     continue
 
@@ -768,7 +771,9 @@ cdef class StringHashTable(HashTable):
             if (ignore_na
                 and (not isinstance(val, (str, unicode))
                      or (use_na_value and val == na_value))):
-                # missing value
+                # if missing values do not count as unique values (i.e. if
+                # ignore_na is True), we can skip the actual value, and
+                # replace the label with na_sentinel directly
                 labels[i] = na_sentinel
             else:
                 # if ignore_na is False, we also stringify NaN/None/etc.
@@ -779,6 +784,7 @@ cdef class StringHashTable(HashTable):
         with nogil:
             for i in range(n):
                 if ignore_na and labels[i] == na_sentinel:
+                    # skip entries for ignored missing values (see above)
                     continue
 
                 v = vecs[i]
@@ -1016,6 +1022,9 @@ cdef class PyObjectHashTable(HashTable):
 
             if ignore_na and ((val != val or val is None)
                               or (use_na_value and val == na_value)):
+                # if missing values do not count as unique values (i.e. if
+                # ignore_na is True), skip the hashtable entry for them, and
+                # replace the corresponding label with na_sentinel
                 labels[i] = na_sentinel
                 continue
 

From 00a304d467483135ad2c035d218a562fbdbeecd4 Mon Sep 17 00:00:00 2001
From: "H. Vetinari" <h.vetinari@gmx.com>
Date: Tue, 27 Nov 2018 08:07:09 +0100
Subject: [PATCH 17/17] Switch return order of hashtable.factorize to
 (uniques, labels) (review jreback)

---
 pandas/_libs/hashtable_class_helper.pxi.in | 39 +++++++++-------------
 pandas/core/algorithms.py                  |  2 +-
 pandas/tests/test_algos.py                 |  2 +-
 3 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index a0aef8808f664..7f4c2a6410870 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -502,18 +502,15 @@ cdef class {{name}}HashTable(HashTable):
 
         Returns
         -------
-        labels : ndarray[int64]
-            The labels from values to uniques
         uniques : ndarray[{{dtype}}]
             Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
         """
         uniques_vector = {{name}}Vector()
-        uniques, labels = self._unique(values, uniques_vector,
-                                       na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
-        # factorize has reversed outputs compared to _unique
-        return labels, uniques
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
@@ -858,18 +855,15 @@ cdef class StringHashTable(HashTable):
 
         Returns
         -------
-        labels : ndarray[int64]
-            The labels from values to uniques
         uniques : ndarray[object]
             Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
         """
         uniques_vector = ObjectVector()
-        uniques, labels = self._unique(values, uniques_vector,
-                                       na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
-        # factorize has reversed outputs compared to _unique
-        return labels, uniques
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
@@ -1092,18 +1086,15 @@ cdef class PyObjectHashTable(HashTable):
 
         Returns
         -------
-        labels : ndarray[int64]
-            The labels from values to uniques
         uniques : ndarray[object]
             Unique values of input, not sorted
+        labels : ndarray[int64]
+            The labels from values to uniques
         """
         uniques_vector = ObjectVector()
-        uniques, labels = self._unique(values, uniques_vector,
-                                       na_sentinel=na_sentinel,
-                                       na_value=na_value, ignore_na=True,
-                                       return_inverse=True)
-        # factorize has reversed outputs compared to _unique
-        return labels, uniques
+        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
+                            na_value=na_value, ignore_na=True,
+                            return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 5f7995ac649a2..98cb45a4d4efc 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
     (hash_klass, _), values = _get_data_algo(values, _hashtables)
 
     table = hash_klass(size_hint or len(values))
-    labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
+    uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
                                       na_value=na_value)
 
     labels = ensure_platform_int(labels)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index c69eb056138c8..c9d403f6696af 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1391,7 +1391,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
         s_duplicated.values.setflags(write=writable)
         na_mask = s_duplicated.isna().values
 
-        result_inverse, result_unique = htable().factorize(s_duplicated.values)
+        result_unique, result_inverse = htable().factorize(s_duplicated.values)
 
         # drop_duplicates has own cython code (hash_table_func_helper.pxi)
         # and is tested separately; keeps first occurrence like ht.factorize()