-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: prepare unifying hashtable.factorize and .unique; add doc-strings #22986
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
640162f
31d0dc5
c5e5147
9918d52
52ae84e
dbe4e0e
8481e19
27ceb4d
f5cd5e9
b1705a9
a6ed5dd
17752ce
19eaf32
ce7626f
7b9014f
471c4da
9d45378
00b2ccb
8687315
a267d4a
7f1bb40
9593992
08d7f50
d91be98
e27ec9a
d825be0
1a342d0
28e0441
d65e4fd
bca615c
3438727
facc111
6d0e86b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -355,19 +355,38 @@ cdef class {{name}}HashTable(HashTable): | |||
|
||||
return np.asarray(locs) | ||||
|
||||
def factorize(self, {{dtype}}_t values): | ||||
uniques = {{name}}Vector() | ||||
labels = self.get_labels(values, uniques, 0, 0) | ||||
return uniques.to_array(), labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||||
Py_ssize_t count_prior, Py_ssize_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, const {{dtype}}_t[:] values, | ||||
{{name}}Vector uniques, Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
""" | ||||
Calculate unique values and labels (no sorting); ignores all NA-values | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[{{dtype}}] | ||||
Array of values of which unique will be calculated | ||||
uniques : {{name}}Vector | ||||
Vector into which uniques will be written | ||||
count_prior : Py_ssize_t, default 0 | ||||
Number of existing entries in uniques | ||||
na_sentinel : Py_ssize_t, default -1 | ||||
Sentinel value used for all NA-values in inverse | ||||
na_value : object, default None | ||||
Value to identify as missing. If na_value is None, then | ||||
any value satisfying val!=val is considered missing. | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[{{dtype}}] | ||||
Unique values of input, not sorted | ||||
labels : ndarray[int64] | ||||
The labels from values to uniques | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
{{dtype}}_t val, na_value2 | ||||
khiter_t k | ||||
|
@@ -399,9 +418,11 @@ cdef class {{name}}HashTable(HashTable): | |||
k = kh_get_{{dtype}}(self.table, val) | ||||
|
||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_{{dtype}}(self.table, val, &ret) | ||||
self.table.vals[k] = count | ||||
|
||||
|
@@ -416,7 +437,20 @@ cdef class {{name}}HashTable(HashTable): | |||
labels[i] = count | ||||
count += 1 | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def factorize(self, {{dtype}}_t[:] values): | ||||
uniques = {{name}}Vector() | ||||
return self._unique_with_inverse(values, uniques=uniques) | ||||
|
||||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it actually needed to still define this? From a quick search it seems There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only use case is in pandas/pandas/core/algorithms.py Line 475 in c8ce3d0
Which can directly use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it could. But it is here that you are changing |
||||
return labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels_groupby(self, const {{dtype}}_t[:] values): | ||||
|
@@ -464,7 +498,21 @@ cdef class {{name}}HashTable(HashTable): | |||
return np.asarray(labels), arr_uniques | ||||
|
||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def unique(self, const {{dtype}}_t[:] values): | ||||
""" | ||||
Calculate unique values without sorting | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[{{dtype}}] | ||||
Array of values of which unique will be calculated | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[{{dtype}}] | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should be dtype here |
||||
Unique values of input, not sorted | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
int ret = 0 | ||||
|
@@ -567,7 +615,21 @@ cdef class StringHashTable(HashTable): | |||
return labels | ||||
|
||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def unique(self, ndarray[object] values): | ||||
""" | ||||
Calculate unique values without sorting | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[object] | ||||
Array of values of which unique will be calculated | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[object] | ||||
Unique values of input, not sorted | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, count, n = len(values) | ||||
int64_t[:] uindexer | ||||
|
@@ -602,11 +664,6 @@ cdef class StringHashTable(HashTable): | |||
uniques.append(values[uindexer[i]]) | ||||
return uniques.to_array() | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
uniques = ObjectVector() | ||||
labels = self.get_labels(values, uniques, 0, 0) | ||||
return uniques.to_array(), labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def lookup(self, ndarray[object] values): | ||||
cdef: | ||||
|
@@ -669,34 +726,55 @@ cdef class StringHashTable(HashTable): | |||
free(vecs) | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior, int64_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, ndarray[object] values, | ||||
ObjectVector uniques, Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
""" | ||||
Calculate unique values and labels (no sorting); ignores all NA-values | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[object] | ||||
Array of values of which unique will be calculated | ||||
uniques : ObjectVector | ||||
Vector into which uniques will be written | ||||
count_prior : Py_ssize_t, default 0 | ||||
Number of existing entries in uniques | ||||
na_sentinel : Py_ssize_t, default -1 | ||||
Sentinel value used for all NA-values in inverse | ||||
na_value : object, default None | ||||
Value to identify as missing | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[object] | ||||
Unique values of input, not sorted | ||||
labels : ndarray[int64] | ||||
The labels from values to uniques | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
int64_t[:] uindexer | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
object val | ||||
const char *v | ||||
const char **vecs | ||||
khiter_t k | ||||
bint use_na_value | ||||
|
||||
# these by-definition *must* be strings | ||||
labels = np.zeros(n, dtype=np.int64) | ||||
uindexer = np.empty(n, dtype=np.int64) | ||||
use_na_value = na_value is not None | ||||
|
||||
# pre-filter out missing | ||||
# and assign pointers | ||||
# assign pointers and pre-filter out missing | ||||
vecs = <const char **> malloc(n * sizeof(char *)) | ||||
for i in range(n): | ||||
val = values[i] | ||||
|
||||
if ((PyUnicode_Check(val) or PyString_Check(val)) and | ||||
not (use_na_value and val == na_value)): | ||||
if ((PyUnicode_Check(val) or PyString_Check(val)) | ||||
and not (use_na_value and val == na_value)): | ||||
v = util.get_c_string(val) | ||||
vecs[i] = v | ||||
else: | ||||
|
@@ -711,9 +789,11 @@ cdef class StringHashTable(HashTable): | |||
v = vecs[i] | ||||
k = kh_get_str(self.table, v) | ||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = <int64_t>idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_str(self.table, v, &ret) | ||||
self.table.vals[k] = count | ||||
uindexer[count] = i | ||||
|
@@ -726,7 +806,20 @@ cdef class StringHashTable(HashTable): | |||
for i in range(count): | ||||
uniques.append(values[uindexer[i]]) | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
uniques = ObjectVector() | ||||
return self._unique_with_inverse(values, uniques=uniques) | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
return labels | ||||
|
||||
|
||||
cdef class PyObjectHashTable(HashTable): | ||||
|
@@ -814,7 +907,22 @@ cdef class PyObjectHashTable(HashTable): | |||
|
||||
return np.asarray(locs) | ||||
|
||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def unique(self, ndarray[object] values): | ||||
""" | ||||
Calculate unique values without sorting | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[object] | ||||
Array of values of which unique will be calculated | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[object] | ||||
Unique values of input, not sorted | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
int ret = 0 | ||||
|
@@ -832,13 +940,38 @@ cdef class PyObjectHashTable(HashTable): | |||
|
||||
return uniques.to_array() | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior, int64_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, ndarray[object] values, | ||||
ObjectVector uniques, Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
""" | ||||
Calculate unique values and labels (no sorting); ignores all NA-values | ||||
|
||||
Parameters | ||||
---------- | ||||
values : ndarray[object] | ||||
Array of values of which unique will be calculated | ||||
uniques : ObjectVector | ||||
Vector into which uniques will be written | ||||
count_prior : Py_ssize_t, default 0 | ||||
Number of existing entries in uniques | ||||
na_sentinel : Py_ssize_t, default -1 | ||||
Sentinel value used for all NA-values in inverse | ||||
na_value : object, default None | ||||
Value to identify as missing. If na_value is None, then None _plus_ | ||||
any value satisfying val!=val are considered missing. | ||||
|
||||
Returns | ||||
------- | ||||
uniques : ndarray[object] | ||||
Unique values of input, not sorted | ||||
labels : ndarray[int64] | ||||
The labels from values to uniques | ||||
""" | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
object val | ||||
khiter_t k | ||||
|
@@ -851,20 +984,35 @@ cdef class PyObjectHashTable(HashTable): | |||
val = values[i] | ||||
hash(val) | ||||
|
||||
if ((val != val or val is None) or | ||||
(use_na_value and val == na_value)): | ||||
if ((val != val or val is None) | ||||
or (use_na_value and val == na_value)): | ||||
labels[i] = na_sentinel | ||||
continue | ||||
|
||||
k = kh_get_pymap(self.table, <PyObject*>val) | ||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_pymap(self.table, <PyObject*>val, &ret) | ||||
self.table.vals[k] = count | ||||
uniques.append(val) | ||||
labels[i] = count | ||||
count += 1 | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
uniques = ObjectVector() | ||||
return self._unique_with_inverse(values, uniques=uniques) | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
return labels |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor comment: since what this algo does is what we typically call "factorize" in pandas, I would call this function "_factorize"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jorisvandenbossche
Can we keep this for the follow-up please? It will make sense there, because then
factorize = unique_with_inverse(ignore_na=True).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand your point here.
hashtable.factorize
already exists?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO it makes sense to do that here, because it is here you are changing
get_labels
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I thought this was a reply to the other comment :-)
Yes, and otherwise the other way around
unique(return_inverse=True) = factorize(ignore_na=False)
. The implementation of that function is what factorize does in pandas (and not unique), so it makes sense to me to use that name.