From 858f54e5e4d5e84f2a45f7c5954cbde9fb6d9438 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 15 Oct 2018 23:38:34 +0200 Subject: [PATCH 01/17] Unify unique/factorize, remove kwargs (perf); enable inverse for unique --- pandas/_libs/hashtable.pyx | 3 +- pandas/_libs/hashtable_class_helper.pxi.in | 438 +++++++++++---------- pandas/core/algorithms.py | 5 +- pandas/tests/test_algos.py | 22 +- 4 files changed, 242 insertions(+), 226 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 2ced98198afc6..1d5f637a05188 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -124,8 +124,7 @@ cdef class Int64Factorizer: uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, - na_value=na_value) + self.count, na_sentinel, na_value) # sort on if sort: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c061102fbaddc..6052768f69e84 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -357,11 +357,12 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + bint ignore_na, bint return_uniques, + bint return_inverse, Py_ssize_t count_prior, + Py_ssize_t na_sentinel, object na_value): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -369,19 +370,32 @@ cdef class {{name}}HashTable(HashTable): Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written - count_prior : Py_ssize_t, default 0 + ignore_na : boolean + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_uniques : boolean + Whether to return the content of the passed "uniques" vector as an + np.ndarray at the end. If False, the vector passed to "uniques" + must be explicitly read and transformed by the user. + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + count_prior : Py_ssize_t Number of existing entries in uniques - na_sentinel : Py_ssize_t, default -1 + na_sentinel : Py_ssize_t Sentinel value used for all NA-values in inverse - na_value : object, default None + na_value : object Value to identify as missing. If na_value is None, then - any value satisfying val!=val are considered missing. + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. Returns ------- - uniques : ndarray[{{dtype}}] + uniques : ndarray[{{dtype}}] (if return_uniques) Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -393,7 +407,8 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -411,20 +426,16 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count if needs_resize(ud): with gil: @@ -434,23 +445,54 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx - return np.asarray(labels) + if return_uniques and return_inverse: + return uniques.to_array(), np.asarray(labels) + elif return_uniques: + return uniques.to_array() + elif return_inverse: + return np.asarray(labels) - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse): uniques = {{name}}Vector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + return self._unique(values, uniques, + False, # ignore_na + True, # return_uniques + return_inverse, + # the rest are of the parameters are not relevant, + # but we don't use kwargs to avoid cython perf hit + 0, # count_prior + -1, # na_sentinel + None) # na_value + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, + object na_value): + uniques = {{name}}Vector() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, + True, # ignore_na + True, # return_uniques + True, # return_inverse + 0, # count_prior + na_sentinel, na_value)[::-1] def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): + return self._unique(values, uniques, + True, # ignore_na + False, # return_uniques + True, # return_inverse + count_prior, na_sentinel, na_value) @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -497,44 +539,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, const {{dtype}}_t[:] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[{{dtype}}] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - {{endfor}} @@ -614,56 +618,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, ndarray[object] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -727,11 +681,12 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na, bint return_uniques, bint return_inverse, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting) Parameters ---------- @@ -739,18 +694,32 @@ cdef class StringHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - count_prior : Py_ssize_t, default 0 + ignore_na : boolean + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_uniques : boolean + Whether to return the content of the passed "uniques" vector as an + np.ndarray at the end. If False, the vector passed to "uniques" + must be explicitly read and transformed by the user. + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + count_prior : Py_ssize_t Number of existing entries in uniques - na_sentinel : Py_ssize_t, default -1 + na_sentinel : Py_ssize_t Sentinel value used for all NA-values in inverse - na_value : object, default None - Value to identify as missing + na_value : object + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. Returns ------- - uniques : ndarray[object] + uniques : ndarray[object] (if return_uniques) Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse) The labels from values to uniques """ cdef: @@ -764,41 +733,46 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - labels = np.zeros(n, dtype=np.int64) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # assign pointers and pre-filter out missing + # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if ((PyUnicode_Check(val) or PyString_Check(val)) - and not (use_na_value and val == na_value)): + if ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val)) + or (use_na_value and val == na_value)): + # missing value + labels[i] = na_sentinel + else: + # if ignore_na is False, we also stringify NaN/None/etc. v = util.get_c_string(val) vecs[i] = v - else: - labels[i] = na_sentinel # compute with nogil: for i in range(n): - if labels[i] == na_sentinel: + if ignore_na and labels[i] == na_sentinel: continue v = vecs[i] k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count uindexer[count] = i - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx free(vecs) @@ -806,20 +780,44 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return np.asarray(labels) + if return_uniques and return_inverse: + return uniques.to_array(), np.asarray(labels) + elif return_uniques: + return uniques.to_array() + elif return_inverse: + return np.asarray(labels) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + def unique(self, ndarray[object] values, bint return_inverse=False): uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + return self._unique(values, uniques, + False, # ignore_na + True, # return_uniques + return_inverse, + # the rest are of the parameters are not relevant, + # but we don't use kwargs to avoid cython perf hit + 0, # count_prior + -1, # na_sentinel + None) # na_value + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): + uniques = ObjectVector() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, + True, # ignore_na + True, # return_uniques + True, # return_inverse + 0, # count_prior + na_sentinel, na_value)[::-1] def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): + return self._unique(values, uniques, + True, # ignore_na + False, # return_uniques + True, # return_inverse + count_prior, na_sentinel, na_value) cdef class PyObjectHashTable(HashTable): @@ -909,44 +907,12 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def unique(self, ndarray[object] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - - return uniques.to_array() - - @cython.boundscheck(False) - @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na, bint return_uniques, bint return_inverse, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting) Parameters ---------- @@ -954,19 +920,32 @@ cdef class PyObjectHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - count_prior : Py_ssize_t, default 0 + ignore_na : boolean + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_uniques : boolean + Whether to return the content of the passed "uniques" vector as an + np.ndarray at the end. If False, the vector passed to "uniques" + must be explicitly read and transformed by the user. + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + count_prior : Py_ssize_t Number of existing entries in uniques - na_sentinel : Py_ssize_t, default -1 + na_sentinel : Py_ssize_t Sentinel value used for all NA-values in inverse - na_value : object, default None + na_value : object Value to identify as missing. If na_value is None, then None _plus_ - any value satisfying val!=val are considered missing. + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. Returns ------- - uniques : ndarray[object] + uniques : ndarray[object] (if return_uniques) Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse) The labels from values to uniques """ cdef: @@ -977,42 +956,69 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) - if ((val != val or val is None) - or (use_na_value and val == na_value)): + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count uniques.append(val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx - return np.asarray(labels) + if return_uniques and return_inverse: + return uniques.to_array(), np.asarray(labels) + elif return_uniques: + return uniques.to_array() + elif return_inverse: + return np.asarray(labels) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): + def unique(self, ndarray[object] values, bint return_inverse=False): + uniques = ObjectVector() + return self._unique(values, uniques, + False, # ignore_na + True, # return_uniques + return_inverse, + # the rest are of the parameters are not relevant, + # but we don't use kwargs to avoid cython perf hit + 0, # count_prior + -1, # na_sentinel + None) # na_value + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, + True, # ignore_na + True, # return_uniques + True, # return_inverse + 0, # count_prior + na_sentinel, na_value)[::-1] def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): + return self._unique(values, uniques, + True, # ignore_na + False, # return_uniques + True, # return_inverse + count_prior, na_sentinel, na_value) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index df2da26685a16..42cff6c431e37 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,7 +365,7 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values) + uniques = table.unique(values, False) uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -470,8 +470,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - labels, uniques = table.factorize(values, na_sentinel=na_sentinel, - na_value=na_value) + labels, uniques = table.factorize(values, na_sentinel, na_value) labels = ensure_platform_int(labels) return labels, uniques diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d491df587fb4a..cf0273826aca3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1318,7 +1318,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype, uniques = uniques() # get_labels may append to uniques - htable.get_labels(vals[:nvals], uniques, 0, -1) + htable.get_labels(vals[:nvals], uniques, 0, -1, None) # to_array() sets an external_view_exists flag on uniques. tmp = uniques.to_array() oldshape = tmp.shape @@ -1326,10 +1326,10 @@ def test_vector_resize(self, writable, htable, uniques, dtype, # subsequent get_labels() calls can no longer append to it # (except for StringHashTables + ObjectVector) if safely_resizes: - htable.get_labels(vals, uniques, 0, -1) + htable.get_labels(vals, uniques, 0, -1, None) else: with tm.assert_raises_regex(ValueError, 'external reference.*'): - htable.get_labels(vals, uniques, 0, -1) + htable.get_labels(vals, uniques, 0, -1, None) uniques.to_array() # should not raise here assert tmp.shape == oldshape @@ -1358,9 +1358,18 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values - result_unique = htable().unique(s_duplicated.values) + return_inverse = False + result_unique = htable().unique(s_duplicated.values, return_inverse) tm.assert_numpy_array_equal(result_unique, expected_unique) + # test with inverse + return_inverse = True + result_unique, result_inverse = htable().unique(s_duplicated.values, + return_inverse) + tm.assert_numpy_array_equal(result_unique, expected_unique) + reconstr = result_unique[result_inverse] + tm.assert_numpy_array_equal(reconstr, s_duplicated.values) + @pytest.mark.parametrize('htable, tm_dtype', [ (ht.PyObjectHashTable, 'String'), (ht.StringHashTable, 'String'), @@ -1383,7 +1392,10 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_inverse, result_unique = htable().factorize(s_duplicated.values) + na_sentinel = -1 + na_value = None + result = htable().factorize(s_duplicated.values, na_sentinel, na_value) + result_inverse, result_unique = result # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() From 4ed354a9544d0576aa1b98ffd26d3667677ed6ea Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 26 Oct 2018 18:33:51 +0200 Subject: [PATCH 02/17] Template over {return_inverse, ignore_na} for perf --- pandas/_libs/hashtable_class_helper.pxi.in | 505 +++++++++++++-------- 1 file changed, 325 insertions(+), 180 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6052768f69e84..6b50c65e29a4b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,14 +355,87 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = {{name}}Vector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - bint ignore_na, bint return_uniques, - bint return_inverse, Py_ssize_t count_prior, - Py_ssize_t na_sentinel, object na_value): + def {{func_name}}(self, const {{dtype}}_t[:] values, + {{name}}Vector uniques, Py_ssize_t count_prior, + Py_ssize_t na_sentinel, object na_value): """ Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -370,17 +443,6 @@ cdef class {{name}}HashTable(HashTable): Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -393,10 +455,23 @@ cdef class {{name}}HashTable(HashTable): Returns ------- - uniques : ndarray[{{dtype}}] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] The labels from values to uniques + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -407,7 +482,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -426,8 +501,8 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (val != val - or (use_na_value and val == na_value2)): + if {{ignore_na}} and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue @@ -445,54 +520,27 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, const {{dtype}}_t[:] values, bint return_inverse): - uniques = {{name}}Vector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, - object na_value): - uniques = {{name}}Vector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}} @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -679,14 +727,87 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) + def unique(self, ndarray[object] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = ObjectVector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na, bint return_uniques, bint return_inverse, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting) + Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -694,17 +815,6 @@ cdef class StringHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -717,10 +827,23 @@ cdef class StringHashTable(HashTable): Returns ------- - uniques : ndarray[object] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -733,7 +856,7 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -743,8 +866,9 @@ cdef class StringHashTable(HashTable): for i in range(n): val = values[i] - if ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val)) - or (use_na_value and val == na_value)): + if ({{ignore_na}} + and (not (PyUnicode_Check(val) or PyString_Check(val)) + or (use_na_value and val == na_value))): # missing value labels[i] = na_sentinel else: @@ -755,7 +879,7 @@ cdef class StringHashTable(HashTable): # compute with nogil: for i in range(n): - if ignore_na and labels[i] == na_sentinel: + if {{ignore_na}} and labels[i] == na_sentinel: continue v = vecs[i] @@ -764,11 +888,11 @@ cdef class StringHashTable(HashTable): # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] @@ -780,44 +904,17 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, ndarray[object] values, bint return_inverse=False): - uniques = ObjectVector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}} cdef class PyObjectHashTable(HashTable): @@ -905,14 +1002,87 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) + def unique(self, ndarray[object] values, bint return_inverse): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + # explicitly compile path without inverse for performance + # the last three arguments are not relevant for this method, but we + # don't use kwargs to avoid cython perf hit (just using default values) + if return_inverse: + return self._unique_with_inverse(values, uniques, 0, -1, None) + return self._unique_no_inverse(values, uniques, 0, -1, None) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, + object na_value): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + # reduced signature compared to _factorize + # not necessary to have uniques-vector, count_prior + uniques = ObjectVector() + return self._factorize(values, uniques, 0, na_sentinel, na_value) + +{{py: +# tuples of "func_name, return_inverse, ignore_na" +unique_funcs = [('_unique_no_inverse', False, False), + ('_unique_with_inverse', True, False), + ('_factorize', True, True), + ('get_labels', True, True)] +}} + +{{for func_name, return_inverse, ignore_na in unique_funcs}} + @cython.boundscheck(False) @cython.wraparound(False) - def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na, bint return_uniques, bint return_inverse, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + object na_value): """ - Calculate unique values and labels (no sorting) + Calculate unique values and labels (no sorting!) +{{if func_name == '_factorize' or func_name == 'get_labels'}} + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" +{{endif}} Parameters ---------- @@ -920,17 +1090,6 @@ cdef class PyObjectHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_uniques : boolean - Whether to return the content of the passed "uniques" vector as an - np.ndarray at the end. If False, the vector passed to "uniques" - must be explicitly read and transformed by the user. - return_inverse : boolean - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t Number of existing entries in uniques na_sentinel : Py_ssize_t @@ -943,10 +1102,23 @@ cdef class PyObjectHashTable(HashTable): Returns ------- - uniques : ndarray[object] (if return_uniques) +{{if func_name == '_unique_no_inverse'}} + uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) +{{elif func_name == '_unique_with_inverse'}} + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] The labels from values to uniques +{{elif func_name == '_factorize' # switched output order for factorize}} + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted +{{elif func_name == 'get_labels'}} + labels : ndarray[int64] + The labels from values to uniques +{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -956,7 +1128,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: + if {{return_inverse}}: labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -964,8 +1136,8 @@ cdef class PyObjectHashTable(HashTable): val = values[i] hash(val) - if ignore_na and ((val != val or val is None) - or (use_na_value and val == na_value)): + if {{ignore_na}} and ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -974,51 +1146,24 @@ cdef class PyObjectHashTable(HashTable): # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if return_inverse: + if {{return_inverse}}: self.table.vals[k] = count labels[i] = count count += 1 - elif return_inverse: + elif {{return_inverse}}: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_uniques and return_inverse: - return uniques.to_array(), np.asarray(labels) - elif return_uniques: - return uniques.to_array() - elif return_inverse: - return np.asarray(labels) - - def unique(self, ndarray[object] values, bint return_inverse=False): - uniques = ObjectVector() - return self._unique(values, uniques, - False, # ignore_na - True, # return_uniques - return_inverse, - # the rest are of the parameters are not relevant, - # but we don't use kwargs to avoid cython perf hit - 0, # count_prior - -1, # na_sentinel - None) # na_value +{{if func_name == '_unique_no_inverse'}} + return uniques.to_array() +{{elif func_name == '_unique_with_inverse'}} + return uniques.to_array(), np.asarray(labels) +{{elif func_name == '_factorize'}} + return np.asarray(labels), uniques.to_array() +{{elif func_name == 'get_labels'}} + return np.asarray(labels) +{{endif}} - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, - True, # ignore_na - True, # return_uniques - True, # return_inverse - 0, # count_prior - na_sentinel, na_value)[::-1] - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): - return self._unique(values, uniques, - True, # ignore_na - False, # return_uniques - True, # return_inverse - count_prior, na_sentinel, na_value) +{{endfor}} From 906cd50e8391148a19fb93487b7fedb25ab9e767 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 26 Oct 2018 19:18:55 +0200 Subject: [PATCH 03/17] Re-add kwargs to method signature --- pandas/_libs/hashtable.pyx | 3 +- pandas/_libs/hashtable_class_helper.pxi.in | 70 ++++++++++------------ pandas/core/algorithms.py | 5 +- pandas/tests/test_algos.py | 17 ++---- 4 files changed, 43 insertions(+), 52 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1d5f637a05188..2ced98198afc6 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -124,7 +124,8 @@ cdef class Int64Factorizer: uniques.extend(self.uniques.to_array()) self.uniques = uniques labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, na_value) + self.count, na_sentinel, + na_value=na_value) # sort on if sort: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 6b50c65e29a4b..260c9bab91e93 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,7 +355,7 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse): + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -363,7 +363,7 @@ cdef class {{name}}HashTable(HashTable): ---------- values : ndarray[{{dtype}}] Array of values of which unique will be calculated - return_inverse : boolean + return_inverse : boolean, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. @@ -376,14 +376,12 @@ cdef class {{name}}HashTable(HashTable): """ uniques = {{name}}Vector() # explicitly compile path without inverse for performance - # the last three arguments are not relevant for this method, but we - # don't use kwargs to avoid cython perf hit (just using default values) if return_inverse: - return self._unique_with_inverse(values, uniques, 0, -1, None) - return self._unique_no_inverse(values, uniques, 0, -1, None) + return self._unique_with_inverse(values, uniques) + return self._unique_no_inverse(values, uniques) - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel, - object na_value): + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) @@ -427,8 +425,8 @@ unique_funcs = [('_unique_no_inverse', False, False), @cython.boundscheck(False) @cython.wraparound(False) def {{func_name}}(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, Py_ssize_t count_prior, - Py_ssize_t na_sentinel, object na_value): + {{name}}Vector uniques, Py_ssize_t count_prior=0, + Py_ssize_t na_sentinel=-1, object na_value=None): """ Calculate unique values and labels (no sorting!) {{if func_name == '_factorize' or func_name == 'get_labels'}} @@ -443,11 +441,11 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written - count_prior : Py_ssize_t + count_prior : Py_ssize_t, default 0 Number of existing entries in uniques - na_sentinel : Py_ssize_t + na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse - na_value : object + na_value : object, default None Value to identify as missing. If na_value is None, then any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" @@ -727,7 +725,7 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) - def unique(self, ndarray[object] values, bint return_inverse): + def unique(self, ndarray[object] values, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -735,7 +733,7 @@ cdef class StringHashTable(HashTable): ---------- values : ndarray[object] Array of values of which unique will be calculated - return_inverse : boolean + return_inverse : boolean, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. @@ -748,14 +746,12 @@ cdef class StringHashTable(HashTable): """ uniques = ObjectVector() # explicitly compile path without inverse for performance - # the last three arguments are not relevant for this method, but we - # don't use kwargs to avoid cython perf hit (just using default values) if return_inverse: - return self._unique_with_inverse(values, uniques, 0, -1, None) - return self._unique_no_inverse(values, uniques, 0, -1, None) + return self._unique_with_inverse(values, uniques) + return self._unique_no_inverse(values, uniques) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) @@ -799,8 +795,8 @@ unique_funcs = [('_unique_no_inverse', False, False), @cython.boundscheck(False) @cython.wraparound(False) def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) {{if func_name == '_factorize' or func_name == 'get_labels'}} @@ -815,11 +811,11 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - count_prior : Py_ssize_t + count_prior : Py_ssize_t, default 0 Number of existing entries in uniques - na_sentinel : Py_ssize_t + na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse - na_value : object + na_value : object, default None Value to identify as missing. If na_value is None, then any value that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying @@ -1002,7 +998,7 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) - def unique(self, ndarray[object] values, bint return_inverse): + def unique(self, ndarray[object] values, bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -1010,7 +1006,7 @@ cdef class PyObjectHashTable(HashTable): ---------- values : ndarray[object] Array of values of which unique will be calculated - return_inverse : boolean + return_inverse : boolean, default False Whether the mapping of the original array values to their location in the vector of uniques should be returned. @@ -1023,14 +1019,12 @@ cdef class PyObjectHashTable(HashTable): """ uniques = ObjectVector() # explicitly compile path without inverse for performance - # the last three arguments are not relevant for this method, but we - # don't use kwargs to avoid cython perf hit (just using default values) if return_inverse: - return self._unique_with_inverse(values, uniques, 0, -1, None) - return self._unique_no_inverse(values, uniques, 0, -1, None) + return self._unique_with_inverse(values, uniques) + return self._unique_no_inverse(values, uniques) - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel, - object na_value): + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) @@ -1074,8 +1068,8 @@ unique_funcs = [('_unique_no_inverse', False, False), @cython.boundscheck(False) @cython.wraparound(False) def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - object na_value): + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) {{if func_name == '_factorize' or func_name == 'get_labels'}} @@ -1090,9 +1084,9 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - count_prior : Py_ssize_t + count_prior : Py_ssize_t, default 0 Number of existing entries in uniques - na_sentinel : Py_ssize_t + na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object Value to identify as missing. If na_value is None, then None _plus_ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42cff6c431e37..df2da26685a16 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -365,7 +365,7 @@ def unique(values): htable, _, values, dtype, ndtype = _get_hashtable_algo(values) table = htable(len(values)) - uniques = table.unique(values, False) + uniques = table.unique(values) uniques = _reconstruct_data(uniques, dtype, original) if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype): @@ -470,7 +470,8 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - labels, uniques = table.factorize(values, na_sentinel, na_value) + labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + na_value=na_value) labels = ensure_platform_int(labels) return labels, uniques diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cf0273826aca3..cd4abc7a67526 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1318,7 +1318,7 @@ def test_vector_resize(self, writable, htable, uniques, dtype, uniques = uniques() # get_labels may append to uniques - htable.get_labels(vals[:nvals], uniques, 0, -1, None) + htable.get_labels(vals[:nvals], uniques, 0, -1) # to_array() sets an external_view_exists flag on uniques. tmp = uniques.to_array() oldshape = tmp.shape @@ -1326,10 +1326,10 @@ def test_vector_resize(self, writable, htable, uniques, dtype, # subsequent get_labels() calls can no longer append to it # (except for StringHashTables + ObjectVector) if safely_resizes: - htable.get_labels(vals, uniques, 0, -1, None) + htable.get_labels(vals, uniques, 0, -1) else: with tm.assert_raises_regex(ValueError, 'external reference.*'): - htable.get_labels(vals, uniques, 0, -1, None) + htable.get_labels(vals, uniques, 0, -1) uniques.to_array() # should not raise here assert tmp.shape == oldshape @@ -1358,14 +1358,12 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() expected_unique = s_duplicated.drop_duplicates(keep='first').values - return_inverse = False - result_unique = htable().unique(s_duplicated.values, return_inverse) + result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) # test with inverse - return_inverse = True result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse) + return_inverse=True) tm.assert_numpy_array_equal(result_unique, expected_unique) reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) @@ -1392,10 +1390,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - na_sentinel = -1 - na_value = None - result = htable().factorize(s_duplicated.values, na_sentinel, na_value) - result_inverse, result_unique = result + result_inverse, result_unique = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize() From 19c7c1f8d240c508b212db6b8d940620164d1566 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 28 Oct 2018 19:11:27 +0100 Subject: [PATCH 04/17] Fix small oversight --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 260c9bab91e93..f59a287a9dc3b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1088,7 +1088,7 @@ unique_funcs = [('_unique_no_inverse', False, False), Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse - na_value : object + na_value : object, default None Value to identify as missing. If na_value is None, then None _plus_ any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" From a8f079f32a2d914bd432520e49c27f740f0bd14a Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 29 Oct 2018 07:47:01 +0100 Subject: [PATCH 05/17] Simplify an if-condition --- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index f59a287a9dc3b..789a3820ddb09 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -429,7 +429,7 @@ unique_funcs = [('_unique_no_inverse', False, False), Py_ssize_t na_sentinel=-1, object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if func_name == '_factorize' or func_name == 'get_labels'}} +{{if ignore_na}} Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" @@ -799,7 +799,7 @@ unique_funcs = [('_unique_no_inverse', False, False), object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if func_name == '_factorize' or func_name == 'get_labels'}} +{{if ignore_na}} Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" @@ -1072,7 +1072,7 @@ unique_funcs = [('_unique_no_inverse', False, False), object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if func_name == '_factorize' or func_name == 'get_labels'}} +{{if ignore_na}} Missing values are not included in the "uniques" for this method. The labels for any missing values will be set to "na_sentinel" From 1c5b97ae73e6b4255d9bae49c2f713ea22564e72 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 29 Oct 2018 07:50:36 +0100 Subject: [PATCH 06/17] Reword comment --- pandas/tests/test_algos.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cd4abc7a67526..27c51983adef9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1361,7 +1361,8 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) - # test with inverse + # test return_inverse=True + # reconstruction can only succeed if the inverse is correct result_unique, result_inverse = htable().unique(s_duplicated.values, return_inverse=True) tm.assert_numpy_array_equal(result_unique, expected_unique) From c7327fd0d80240cef76cfe019266c00a105befdc Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 31 Oct 2018 19:19:45 +0100 Subject: [PATCH 07/17] Revert templating over {return_inverse, ignore_na} --- pandas/_libs/hashtable_class_helper.pxi.in | 555 +++++++++------------ 1 file changed, 246 insertions(+), 309 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 789a3820ddb09..0d9b56880eb34 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -355,85 +355,14 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(locs) - def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): - """ - Calculate unique values and labels (no sorting!) - - Parameters - ---------- - values : ndarray[{{dtype}}] - Array of values of which unique will be calculated - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. - - Returns - ------- - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) - The labels from values to uniques - """ - uniques = {{name}}Vector() - # explicitly compile path without inverse for performance - if return_inverse: - return self._unique_with_inverse(values, uniques) - return self._unique_no_inverse(values, uniques) - - def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, - object na_value=None): - """ - Calculate unique values and labels (no sorting!) - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" - - Parameters - ---------- - values : ndarray[{{dtype}}] - Array of values of which unique will be calculated - na_sentinel : Py_ssize_t, default -1 - Sentinel value used for all NA-values in inverse - na_value : object, default None - Value to identify as missing. If na_value is None, then - any value "val" satisfying val != val is considered missing. - If na_value is not None, then _additionally_, any value "val" - satisfying val == na_value is considered missing. - - Returns - ------- - labels : ndarray[int64] - The labels from values to uniques - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - """ - # reduced signature compared to _factorize - # not necessary to have uniques-vector, count_prior - uniques = {{name}}Vector() - return self._factorize(values, uniques, 0, na_sentinel, na_value) - -{{py: -# tuples of "func_name, return_inverse, ignore_na" -unique_funcs = [('_unique_no_inverse', False, False), - ('_unique_with_inverse', True, False), - ('_factorize', True, True), - ('get_labels', True, True)] -}} - -{{for func_name, return_inverse, ignore_na in unique_funcs}} - @cython.boundscheck(False) @cython.wraparound(False) - def {{func_name}}(self, const {{dtype}}_t[:] values, - {{name}}Vector uniques, Py_ssize_t count_prior=0, - Py_ssize_t na_sentinel=-1, object na_value=None): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if ignore_na}} - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" -{{endif}} Parameters ---------- @@ -441,6 +370,13 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -453,23 +389,10 @@ unique_funcs = [('_unique_no_inverse', False, False), Returns ------- -{{if func_name == '_unique_no_inverse'}} - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted -{{elif func_name == '_unique_with_inverse'}} - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - labels : ndarray[int64] - The labels from values to uniques -{{elif func_name == '_factorize' # switched output order for factorize}} - labels : ndarray[int64] - The labels from values to uniques uniques : ndarray[{{dtype}}] Unique values of input, not sorted -{{elif func_name == 'get_labels'}} - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques -{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -480,7 +403,7 @@ unique_funcs = [('_unique_no_inverse', False, False), {{name}}VectorData *ud bint use_na_value - if {{return_inverse}}: + if return_inverse: labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -499,8 +422,8 @@ unique_funcs = [('_unique_no_inverse', False, False), for i in range(n): val = values[i] - if {{ignore_na}} and (val != val - or (use_na_value and val == na_value2)): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): labels[i] = na_sentinel continue @@ -518,27 +441,83 @@ unique_funcs = [('_unique_no_inverse', False, False), "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if {{return_inverse}}: + if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 - elif {{return_inverse}}: + elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx -{{if func_name == '_unique_no_inverse'}} + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() -{{elif func_name == '_unique_with_inverse'}} - return uniques.to_array(), np.asarray(labels) -{{elif func_name == '_factorize'}} - return np.asarray(labels), uniques.to_array() -{{elif func_name == 'get_labels'}} - return np.asarray(labels) -{{endif}} -{{endfor}} + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + """ + uniques = {{name}}Vector() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, ignore_na=True, + return_inverse=True, na_sentinel=na_sentinel, + na_value=na_value)[::-1] + + def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -725,85 +704,14 @@ cdef class StringHashTable(HashTable): self.table.vals[k] = i free(vecs) - def unique(self, ndarray[object] values, bint return_inverse=False): - """ - Calculate unique values and labels (no sorting!) - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) - The labels from values to uniques - """ - uniques = ObjectVector() - # explicitly compile path without inverse for performance - if return_inverse: - return self._unique_with_inverse(values, uniques) - return self._unique_no_inverse(values, uniques) - - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): - """ - Calculate unique values and labels (no sorting!) - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - na_sentinel : Py_ssize_t, default -1 - Sentinel value used for all NA-values in inverse - na_value : object, default None - Value to identify as missing. If na_value is None, then any value - that is not a string is considered missing. If na_value is - not None, then _additionally_ any value "val" satisfying - val == na_value is considered missing. - - Returns - ------- - labels : ndarray[int64] - The labels from values to uniques - uniques : ndarray[object] - Unique values of input, not sorted - """ - # reduced signature compared to _factorize - # not necessary to have uniques-vector, count_prior - uniques = ObjectVector() - return self._factorize(values, uniques, 0, na_sentinel, na_value) - -{{py: -# tuples of "func_name, return_inverse, ignore_na" -unique_funcs = [('_unique_no_inverse', False, False), - ('_unique_with_inverse', True, False), - ('_factorize', True, True), - ('get_labels', True, True)] -}} - -{{for func_name, return_inverse, ignore_na in unique_funcs}} - @cython.boundscheck(False) @cython.wraparound(False) - def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if ignore_na}} - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" -{{endif}} Parameters ---------- @@ -811,6 +719,13 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -823,23 +738,10 @@ unique_funcs = [('_unique_no_inverse', False, False), Returns ------- -{{if func_name == '_unique_no_inverse'}} - uniques : ndarray[object] - Unique values of input, not sorted -{{elif func_name == '_unique_with_inverse'}} - uniques : ndarray[object] - Unique values of input, not sorted - labels : ndarray[int64] - The labels from values to uniques -{{elif func_name == '_factorize' # switched output order for factorize}} - labels : ndarray[int64] - The labels from values to uniques uniques : ndarray[object] Unique values of input, not sorted -{{elif func_name == 'get_labels'}} - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques -{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -852,7 +754,7 @@ unique_funcs = [('_unique_no_inverse', False, False), khiter_t k bint use_na_value - if {{return_inverse}}: + if return_inverse: labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -862,7 +764,7 @@ unique_funcs = [('_unique_no_inverse', False, False), for i in range(n): val = values[i] - if ({{ignore_na}} + if (ignore_na and (not (PyUnicode_Check(val) or PyString_Check(val)) or (use_na_value and val == na_value))): # missing value @@ -875,7 +777,7 @@ unique_funcs = [('_unique_no_inverse', False, False), # compute with nogil: for i in range(n): - if {{ignore_na}} and labels[i] == na_sentinel: + if ignore_na and labels[i] == na_sentinel: continue v = vecs[i] @@ -884,11 +786,11 @@ unique_funcs = [('_unique_no_inverse', False, False), # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if {{return_inverse}}: + if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 - elif {{return_inverse}}: + elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] @@ -900,17 +802,73 @@ unique_funcs = [('_unique_no_inverse', False, False), for i in range(count): uniques.append(values[uindexer[i]]) -{{if func_name == '_unique_no_inverse'}} + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() -{{elif func_name == '_unique_with_inverse'}} - return uniques.to_array(), np.asarray(labels) -{{elif func_name == '_factorize'}} - return np.asarray(labels), uniques.to_array() -{{elif func_name == 'get_labels'}} - return np.asarray(labels) -{{endif}} -{{endfor}} + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + uniques = ObjectVector() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, ignore_na=True, + return_inverse=True, na_sentinel=na_sentinel, + na_value=na_value)[::-1] + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels cdef class PyObjectHashTable(HashTable): @@ -998,85 +956,14 @@ cdef class PyObjectHashTable(HashTable): return np.asarray(locs) - def unique(self, ndarray[object] values, bint return_inverse=False): - """ - Calculate unique values and labels (no sorting!) - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse) - The labels from values to uniques - """ - uniques = ObjectVector() - # explicitly compile path without inverse for performance - if return_inverse: - return self._unique_with_inverse(values, uniques) - return self._unique_no_inverse(values, uniques) - - def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, - object na_value=None): - """ - Calculate unique values and labels (no sorting!) - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - na_sentinel : Py_ssize_t, default -1 - Sentinel value used for all NA-values in inverse - na_value : object, default None - Value to identify as missing. If na_value is None, then None _plus_ - any value "val" satisfying val != val is considered missing. - If na_value is not None, then _additionally_, any value "val" - satisfying val == na_value is considered missing. - - Returns - ------- - labels : ndarray[int64] - The labels from values to uniques - uniques : ndarray[object] - Unique values of input, not sorted - """ - # reduced signature compared to _factorize - # not necessary to have uniques-vector, count_prior - uniques = ObjectVector() - return self._factorize(values, uniques, 0, na_sentinel, na_value) - -{{py: -# tuples of "func_name, return_inverse, ignore_na" -unique_funcs = [('_unique_no_inverse', False, False), - ('_unique_with_inverse', True, False), - ('_factorize', True, True), - ('get_labels', True, True)] -}} - -{{for func_name, return_inverse, ignore_na in unique_funcs}} - @cython.boundscheck(False) @cython.wraparound(False) - def {{func_name}}(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + bint ignore_na=False, bint return_inverse=False, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): """ Calculate unique values and labels (no sorting!) -{{if ignore_na}} - - Missing values are not included in the "uniques" for this method. - The labels for any missing values will be set to "na_sentinel" -{{endif}} Parameters ---------- @@ -1084,6 +971,13 @@ unique_funcs = [('_unique_no_inverse', False, False), Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -1096,23 +990,10 @@ unique_funcs = [('_unique_no_inverse', False, False), Returns ------- -{{if func_name == '_unique_no_inverse'}} uniques : ndarray[object] Unique values of input, not sorted -{{elif func_name == '_unique_with_inverse'}} - uniques : ndarray[object] - Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques -{{elif func_name == '_factorize' # switched output order for factorize}} - labels : ndarray[int64] - The labels from values to uniques - uniques : ndarray[object] - Unique values of input, not sorted -{{elif func_name == 'get_labels'}} - labels : ndarray[int64] - The labels from values to uniques -{{endif}} """ cdef: Py_ssize_t i, idx, count = count_prior, n = len(values) @@ -1122,7 +1003,7 @@ unique_funcs = [('_unique_no_inverse', False, False), khiter_t k bint use_na_value - if {{return_inverse}}: + if return_inverse: labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -1130,8 +1011,8 @@ unique_funcs = [('_unique_no_inverse', False, False), val = values[i] hash(val) - if {{ignore_na}} and ((val != val or val is None) - or (use_na_value and val == na_value)): + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): labels[i] = na_sentinel continue @@ -1140,24 +1021,80 @@ unique_funcs = [('_unique_no_inverse', False, False), # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if {{return_inverse}}: + if return_inverse: self.table.vals[k] = count labels[i] = count count += 1 - elif {{return_inverse}}: + elif return_inverse: # k falls into a previous bucket # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx -{{if func_name == '_unique_no_inverse'}} + if return_inverse: + return uniques.to_array(), np.asarray(labels) return uniques.to_array() -{{elif func_name == '_unique_with_inverse'}} - return uniques.to_array(), np.asarray(labels) -{{elif func_name == '_factorize'}} - return np.asarray(labels), uniques.to_array() -{{elif func_name == 'get_labels'}} - return np.asarray(labels) -{{endif}} -{{endfor}} + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) + + def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, + object na_value=None): + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + labels : ndarray[int64] + The labels from values to uniques + uniques : ndarray[object] + Unique values of input, not sorted + """ + uniques = ObjectVector() + # factorize has reversed outputs compared to _unique (see "[::-1]") + return self._unique(values, uniques, ignore_na=True, + return_inverse=True, na_sentinel=na_sentinel, + na_value=na_value)[::-1] + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None): + _, labels = self._unique(values, uniques, ignore_na=True, + return_inverse=True, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value) + return labels From a06494e33237e9a478340f5b5bcb4d006ff4f7a0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 3 Nov 2018 16:50:52 +0100 Subject: [PATCH 08/17] Add new kwargs at the end (review jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 108 +++++++++++---------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 99fc3337c73f6..ed2e6b9e65d3d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -357,9 +357,9 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - bint ignore_na=False, bint return_inverse=False, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -369,13 +369,6 @@ cdef class {{name}}HashTable(HashTable): Array of values of which unique will be calculated uniques : {{name}}Vector Vector into which uniques will be written - ignore_na : boolean, default False - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -385,6 +378,13 @@ cdef class {{name}}HashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- @@ -504,18 +504,20 @@ cdef class {{name}}HashTable(HashTable): uniques : ndarray[{{dtype}}] Unique values of input, not sorted """ - uniques = {{name}}Vector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, ignore_na=True, - return_inverse=True, na_sentinel=na_sentinel, - na_value=na_value)[::-1] + uniques_vector = {{name}}Vector() + uniques, labels = self._unique(values, uniques_vector, + na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) + # factorize has reversed outputs compared to _unique + return labels, uniques def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) return labels @cython.boundscheck(False) @@ -706,9 +708,9 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -718,13 +720,6 @@ cdef class StringHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean, default False - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -734,6 +729,13 @@ cdef class StringHashTable(HashTable): that is not a string is considered missing. If na_value is not None, then _additionally_ any value "val" satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- @@ -855,18 +857,20 @@ cdef class StringHashTable(HashTable): uniques : ndarray[object] Unique values of input, not sorted """ - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, ignore_na=True, - return_inverse=True, na_sentinel=na_sentinel, - na_value=na_value)[::-1] + uniques_vector = ObjectVector() + uniques, labels = self._unique(values, uniques_vector, + na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) + # factorize has reversed outputs compared to _unique + return labels, uniques def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) return labels @@ -958,9 +962,9 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, - bint ignore_na=False, bint return_inverse=False, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -970,13 +974,6 @@ cdef class PyObjectHashTable(HashTable): Array of values of which unique will be calculated uniques : ObjectVector Vector into which uniques will be written - ignore_na : boolean, default False - Whether NA-values should be ignored for calculating the uniques. If - True, the labels corresponding to missing values will be set to - na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. count_prior : Py_ssize_t, default 0 Number of existing entries in uniques na_sentinel : Py_ssize_t, default -1 @@ -986,6 +983,13 @@ cdef class PyObjectHashTable(HashTable): any value "val" satisfying val != val is considered missing. If na_value is not None, then _additionally_, any value "val" satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- @@ -1084,16 +1088,18 @@ cdef class PyObjectHashTable(HashTable): uniques : ndarray[object] Unique values of input, not sorted """ - uniques = ObjectVector() - # factorize has reversed outputs compared to _unique (see "[::-1]") - return self._unique(values, uniques, ignore_na=True, - return_inverse=True, na_sentinel=na_sentinel, - na_value=na_value)[::-1] + uniques_vector = ObjectVector() + uniques, labels = self._unique(values, uniques_vector, + na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) + # factorize has reversed outputs compared to _unique + return labels, uniques def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - _, labels = self._unique(values, uniques, ignore_na=True, - return_inverse=True, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) return labels From 906a2b9fa52c808a2f56661afa2ceb8d9915c7f8 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Nov 2018 11:14:32 +0100 Subject: [PATCH 09/17] Retrigger CircleCI From 29aecdd1d223b63df1505d946cd900c333be5265 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Nov 2018 18:16:46 +0100 Subject: [PATCH 10/17] Retrigger CI after flaky hypothesis test From 746c0e393876ca974d96067a626125622b72070b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Nov 2018 19:14:36 +0100 Subject: [PATCH 11/17] Retrigger CircleCI From 8da33f4ab68ee1b9fd10806172b9ffc6c8392b0e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Nov 2018 22:36:23 +0100 Subject: [PATCH 12/17] Retrigger CI after timeout From ba9d8b805d57080969e71567b8df89e35e87afb2 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 11 Nov 2018 22:38:13 +0100 Subject: [PATCH 13/17] Retrigger CircleCI From 30de418936cda9aa9c26a8cfa4c3a0b42906e2b2 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 18 Nov 2018 14:49:23 +0100 Subject: [PATCH 14/17] Always calculate inverse --- pandas/_libs/hashtable_class_helper.pxi.in | 106 ++++++++------------- 1 file changed, 41 insertions(+), 65 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c26e1e5d102d7..cf85a9f20e2c5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -358,8 +358,7 @@ cdef class {{name}}HashTable(HashTable): @cython.wraparound(False) def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False, - bint return_inverse=False): + object na_value=None, bint ignore_na=False): """ Calculate unique values and labels (no sorting!) @@ -382,15 +381,12 @@ cdef class {{name}}HashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[int64] The labels from values to uniques """ cdef: @@ -402,8 +398,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -440,19 +435,15 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - elif return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + else: # k falls into a previous bucket - # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): """ @@ -474,8 +465,10 @@ cdef class {{name}}HashTable(HashTable): The labels from values to uniques """ uniques = {{name}}Vector() - return self._unique(values, uniques, ignore_na=False, - return_inverse=return_inverse) + uniques, inverse = self._unique(values, uniques, ignore_na=False) + if return_inverse: + return uniques, inverse + return uniques def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -507,8 +500,7 @@ cdef class {{name}}HashTable(HashTable): uniques_vector = {{name}}Vector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) + na_value=na_value, ignore_na=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -517,7 +509,7 @@ cdef class {{name}}HashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) + ignore_na=True) return labels @cython.boundscheck(False) @@ -709,8 +701,7 @@ cdef class StringHashTable(HashTable): @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False, - bint return_inverse=False): + object na_value=None, bint ignore_na=False): """ Calculate unique values and labels (no sorting!) @@ -733,15 +724,12 @@ cdef class StringHashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[int64] The labels from values to uniques """ cdef: @@ -755,8 +743,7 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.zeros(n, dtype=np.int64) + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -787,13 +774,11 @@ cdef class StringHashTable(HashTable): # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - if return_inverse: - self.table.vals[k] = count - labels[i] = count + self.table.vals[k] = count + labels[i] = count count += 1 - elif return_inverse: + else: # k falls into a previous bucket - # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx @@ -803,9 +788,7 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) def unique(self, ndarray[object] values, bint return_inverse=False): """ @@ -827,8 +810,10 @@ cdef class StringHashTable(HashTable): The labels from values to uniques """ uniques = ObjectVector() - return self._unique(values, uniques, ignore_na=False, - return_inverse=return_inverse) + uniques, inverse = self._unique(values, uniques, ignore_na=False) + if return_inverse: + return uniques, inverse + return uniques def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -860,8 +845,7 @@ cdef class StringHashTable(HashTable): uniques_vector = ObjectVector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) + na_value=na_value, ignore_na=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -870,7 +854,7 @@ cdef class StringHashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) + ignore_na=True) return labels @@ -963,8 +947,7 @@ cdef class PyObjectHashTable(HashTable): @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False, - bint return_inverse=False): + object na_value=None, bint ignore_na=False): """ Calculate unique values and labels (no sorting!) @@ -987,15 +970,12 @@ cdef class PyObjectHashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. - return_inverse : boolean, default False - Whether the mapping of the original array values to their location - in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] (if return_inverse=True) + labels : ndarray[int64] The labels from values to uniques """ cdef: @@ -1006,8 +986,7 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - if return_inverse: - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): @@ -1024,19 +1003,15 @@ cdef class PyObjectHashTable(HashTable): # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - if return_inverse: - self.table.vals[k] = count - labels[i] = count - count += 1 - elif return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + else: # k falls into a previous bucket - # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - if return_inverse: - return uniques.to_array(), np.asarray(labels) - return uniques.to_array() + return uniques.to_array(), np.asarray(labels) def unique(self, ndarray[object] values, bint return_inverse=False): """ @@ -1058,8 +1033,10 @@ cdef class PyObjectHashTable(HashTable): The labels from values to uniques """ uniques = ObjectVector() - return self._unique(values, uniques, ignore_na=False, - return_inverse=return_inverse) + uniques, inverse = self._unique(values, uniques, ignore_na=False) + if return_inverse: + return uniques, inverse + return uniques def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -1091,8 +1068,7 @@ cdef class PyObjectHashTable(HashTable): uniques_vector = ObjectVector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) + na_value=na_value, ignore_na=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -1101,5 +1077,5 @@ cdef class PyObjectHashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True, return_inverse=True) + ignore_na=True) return labels From 0b85759f9c1ec9d2a5f7f242972d34e2d9704476 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 19 Nov 2018 23:36:48 +0100 Subject: [PATCH 15/17] Revert "Always calculate inverse" This reverts commit 30de418936cda9aa9c26a8cfa4c3a0b42906e2b2. --- pandas/_libs/hashtable_class_helper.pxi.in | 106 +++++++++++++-------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index cf85a9f20e2c5..c26e1e5d102d7 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -358,7 +358,8 @@ cdef class {{name}}HashTable(HashTable): @cython.wraparound(False) def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -381,12 +382,15 @@ cdef class {{name}}HashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -398,7 +402,8 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -435,15 +440,19 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - self.table.vals[k] = count - labels[i] = count - count += 1 - else: + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: # k falls into a previous bucket + # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - return uniques.to_array(), np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): """ @@ -465,10 +474,8 @@ cdef class {{name}}HashTable(HashTable): The labels from values to uniques """ uniques = {{name}}Vector() - uniques, inverse = self._unique(values, uniques, ignore_na=False) - if return_inverse: - return uniques, inverse - return uniques + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -500,7 +507,8 @@ cdef class {{name}}HashTable(HashTable): uniques_vector = {{name}}Vector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True) + na_value=na_value, ignore_na=True, + return_inverse=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -509,7 +517,7 @@ cdef class {{name}}HashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True) + ignore_na=True, return_inverse=True) return labels @cython.boundscheck(False) @@ -701,7 +709,8 @@ cdef class StringHashTable(HashTable): @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -724,12 +733,15 @@ cdef class StringHashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -743,7 +755,8 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - labels = np.zeros(n, dtype=np.int64) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None @@ -774,11 +787,13 @@ cdef class StringHashTable(HashTable): # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) uindexer[count] = i - self.table.vals[k] = count - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 - else: + elif return_inverse: # k falls into a previous bucket + # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx @@ -788,7 +803,9 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return uniques.to_array(), np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): """ @@ -810,10 +827,8 @@ cdef class StringHashTable(HashTable): The labels from values to uniques """ uniques = ObjectVector() - uniques, inverse = self._unique(values, uniques, ignore_na=False) - if return_inverse: - return uniques, inverse - return uniques + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -845,7 +860,8 @@ cdef class StringHashTable(HashTable): uniques_vector = ObjectVector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True) + na_value=na_value, ignore_na=True, + return_inverse=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -854,7 +870,7 @@ cdef class StringHashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True) + ignore_na=True, return_inverse=True) return labels @@ -947,7 +963,8 @@ cdef class PyObjectHashTable(HashTable): @cython.wraparound(False) def _unique(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, bint ignore_na=False): + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ Calculate unique values and labels (no sorting!) @@ -970,12 +987,15 @@ cdef class PyObjectHashTable(HashTable): Whether NA-values should be ignored for calculating the uniques. If True, the labels corresponding to missing values will be set to na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -986,7 +1006,8 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): @@ -1003,15 +1024,19 @@ cdef class PyObjectHashTable(HashTable): # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) uniques.append(val) - self.table.vals[k] = count - labels[i] = count - count += 1 - else: + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: # k falls into a previous bucket + # only relevant in case we need to construct the inverse idx = self.table.vals[k] labels[i] = idx - return uniques.to_array(), np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() def unique(self, ndarray[object] values, bint return_inverse=False): """ @@ -1033,10 +1058,8 @@ cdef class PyObjectHashTable(HashTable): The labels from values to uniques """ uniques = ObjectVector() - uniques, inverse = self._unique(values, uniques, ignore_na=False) - if return_inverse: - return uniques, inverse - return uniques + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): @@ -1068,7 +1091,8 @@ cdef class PyObjectHashTable(HashTable): uniques_vector = ObjectVector() uniques, labels = self._unique(values, uniques_vector, na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True) + na_value=na_value, ignore_na=True, + return_inverse=True) # factorize has reversed outputs compared to _unique return labels, uniques @@ -1077,5 +1101,5 @@ cdef class PyObjectHashTable(HashTable): object na_value=None): _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, - ignore_na=True) + ignore_na=True, return_inverse=True) return labels From 44518545259e6415f198501b3450bc189c8a6aec Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 21 Nov 2018 22:43:02 +0100 Subject: [PATCH 16/17] Add comments to ignore_na branches (review jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c26e1e5d102d7..a0aef8808f664 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -423,6 +423,9 @@ cdef class {{name}}HashTable(HashTable): if ignore_na and (val != val or (use_na_value and val == na_value2)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, + # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue @@ -768,7 +771,9 @@ cdef class StringHashTable(HashTable): if (ignore_na and (not isinstance(val, (str, unicode)) or (use_na_value and val == na_value))): - # missing value + # if missing values do not count as unique values (i.e. if + # ignore_na is True), we can skip the actual value, and + # replace the label with na_sentinel directly labels[i] = na_sentinel else: # if ignore_na is False, we also stringify NaN/None/etc. @@ -779,6 +784,7 @@ cdef class StringHashTable(HashTable): with nogil: for i in range(n): if ignore_na and labels[i] == na_sentinel: + # skip entries for ignored missing values (see above) continue v = vecs[i] @@ -1016,6 +1022,9 @@ cdef class PyObjectHashTable(HashTable): if ignore_na and ((val != val or val is None) or (use_na_value and val == na_value)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, and + # replace the corresponding label with na_sentinel labels[i] = na_sentinel continue From 00a304d467483135ad2c035d218a562fbdbeecd4 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 27 Nov 2018 08:07:09 +0100 Subject: [PATCH 17/17] Switch signature of hashtable.factorize (review jreback) --- pandas/_libs/hashtable_class_helper.pxi.in | 39 +++++++++------------- pandas/core/algorithms.py | 2 +- pandas/tests/test_algos.py | 2 +- 3 files changed, 17 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a0aef8808f664..7f4c2a6410870 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -502,18 +502,15 @@ cdef class {{name}}HashTable(HashTable): Returns ------- - labels : ndarray[int64] - The labels from values to uniques uniques : ndarray[{{dtype}}] Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques """ uniques_vector = {{name}}Vector() - uniques, labels = self._unique(values, uniques_vector, - na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) - # factorize has reversed outputs compared to _unique - return labels, uniques + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -858,18 +855,15 @@ cdef class StringHashTable(HashTable): Returns ------- - labels : ndarray[int64] - The labels from values to uniques uniques : ndarray[object] Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques """ uniques_vector = ObjectVector() - uniques, labels = self._unique(values, uniques_vector, - na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) - # factorize has reversed outputs compared to _unique - return labels, uniques + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, @@ -1092,18 +1086,15 @@ cdef class PyObjectHashTable(HashTable): Returns ------- - labels : ndarray[int64] - The labels from values to uniques uniques : ndarray[object] Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques """ uniques_vector = ObjectVector() - uniques, labels = self._unique(values, uniques_vector, - na_sentinel=na_sentinel, - na_value=na_value, ignore_na=True, - return_inverse=True) - # factorize has reversed outputs compared to _unique - return labels, uniques + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5f7995ac649a2..98cb45a4d4efc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + uniques, labels = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) labels = ensure_platform_int(labels) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c69eb056138c8..c9d403f6696af 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1391,7 +1391,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): s_duplicated.values.setflags(write=writable) na_mask = s_duplicated.isna().values - result_inverse, result_unique = htable().factorize(s_duplicated.values) + result_unique, result_inverse = htable().factorize(s_duplicated.values) # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.factorize()