-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: prepare unifying hashtable.factorize and .unique; add doc-strings #22986
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
640162f
31d0dc5
c5e5147
9918d52
52ae84e
dbe4e0e
8481e19
27ceb4d
f5cd5e9
b1705a9
a6ed5dd
17752ce
19eaf32
ce7626f
7b9014f
471c4da
9d45378
00b2ccb
8687315
a267d4a
7f1bb40
9593992
08d7f50
d91be98
e27ec9a
d825be0
1a342d0
28e0441
d65e4fd
bca615c
3438727
facc111
6d0e86b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -355,19 +355,15 @@ cdef class {{name}}HashTable(HashTable): | |||
|
||||
return np.asarray(locs) | ||||
|
||||
def factorize(self, {{dtype}}_t values): | ||||
uniques = {{name}}Vector() | ||||
labels = self.get_labels(values, uniques, 0, 0) | ||||
return uniques.to_array(), labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||||
Py_ssize_t count_prior, Py_ssize_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, const {{dtype}}_t[:] values, | ||||
{{name}}Vector uniques, bint ignore_na=False, | ||||
Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
{{dtype}}_t val, na_value2 | ||||
khiter_t k | ||||
|
@@ -392,16 +388,19 @@ cdef class {{name}}HashTable(HashTable): | |||
for i in range(n): | ||||
val = values[i] | ||||
|
||||
if val != val or (use_na_value and val == na_value2): | ||||
if ignore_na and (val != val | ||||
or (use_na_value and val == na_value2)): | ||||
labels[i] = na_sentinel | ||||
continue | ||||
|
||||
k = kh_get_{{dtype}}(self.table, val) | ||||
|
||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_{{dtype}}(self.table, val, &ret) | ||||
self.table.vals[k] = count | ||||
|
||||
|
@@ -416,7 +415,26 @@ cdef class {{name}}HashTable(HashTable): | |||
labels[i] = count | ||||
count += 1 | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): | ||||
if return_inverse: | ||||
return self._unique_with_inverse(values, uniques={{name}}Vector(), | ||||
ignore_na=False) | ||||
return self._unique_no_inverse(values) | ||||
|
||||
def factorize(self, {{dtype}}_t[:] values): | ||||
return self._unique_with_inverse(values, uniques={{name}}Vector(), | ||||
ignore_na=True) | ||||
|
||||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it actually needed to still define this? From a quick search it seems There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only use case is in pandas/pandas/core/algorithms.py Line 475 in c8ce3d0
Which can directly use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it could. But it is here that you are changing |
||||
return labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels_groupby(self, const {{dtype}}_t[:] values): | ||||
|
@@ -464,7 +482,9 @@ cdef class {{name}}HashTable(HashTable): | |||
return np.asarray(labels), arr_uniques | ||||
|
||||
@cython.boundscheck(False) | ||||
def unique(self, const {{dtype}}_t[:] values): | ||||
@cython.wraparound(False) | ||||
def _unique_no_inverse(self, const {{dtype}}_t[:] values): | ||||
# define separate functions without inverse for performance | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
int ret = 0 | ||||
|
@@ -567,7 +587,9 @@ cdef class StringHashTable(HashTable): | |||
return labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def unique(self, ndarray[object] values): | ||||
@cython.wraparound(False) | ||||
def _unique_no_inverse(self, ndarray[object] values): | ||||
# define separate functions without inverse for performance | ||||
cdef: | ||||
Py_ssize_t i, count, n = len(values) | ||||
int64_t[:] uindexer | ||||
|
@@ -602,11 +624,6 @@ cdef class StringHashTable(HashTable): | |||
uniques.append(values[uindexer[i]]) | ||||
return uniques.to_array() | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
uniques = ObjectVector() | ||||
labels = self.get_labels(values, uniques, 0, 0) | ||||
return uniques.to_array(), labels | ||||
|
||||
@cython.boundscheck(False) | ||||
def lookup(self, ndarray[object] values): | ||||
cdef: | ||||
|
@@ -669,34 +686,34 @@ cdef class StringHashTable(HashTable): | |||
free(vecs) | ||||
|
||||
@cython.boundscheck(False) | ||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior, int64_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, ndarray[object] values, | ||||
ObjectVector uniques, bint ignore_na=False, | ||||
Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
int64_t[:] uindexer | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
object val | ||||
const char *v | ||||
const char **vecs | ||||
khiter_t k | ||||
bint use_na_value | ||||
|
||||
# these by-definition *must* be strings | ||||
labels = np.zeros(n, dtype=np.int64) | ||||
uindexer = np.empty(n, dtype=np.int64) | ||||
use_na_value = na_value is not None | ||||
|
||||
# pre-filter out missing | ||||
# and assign pointers | ||||
# assign pointers and pre-filter out missing (if ignore_na) | ||||
vecs = <const char **> malloc(n * sizeof(char *)) | ||||
for i in range(n): | ||||
val = values[i] | ||||
|
||||
if ((PyUnicode_Check(val) or PyString_Check(val)) and | ||||
not (use_na_value and val == na_value)): | ||||
if not ignore_na or ((PyUnicode_Check(val) or PyString_Check(val)) | ||||
and not (use_na_value and val == na_value)): | ||||
# if ignore_na is False, we also stringify NaN/None/etc. | ||||
v = util.get_c_string(val) | ||||
vecs[i] = v | ||||
else: | ||||
|
@@ -705,15 +722,17 @@ cdef class StringHashTable(HashTable): | |||
# compute | ||||
with nogil: | ||||
for i in range(n): | ||||
if labels[i] == na_sentinel: | ||||
if ignore_na and labels[i] == na_sentinel: | ||||
continue | ||||
|
||||
v = vecs[i] | ||||
k = kh_get_str(self.table, v) | ||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = <int64_t>idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_str(self.table, v, &ret) | ||||
self.table.vals[k] = count | ||||
uindexer[count] = i | ||||
|
@@ -726,7 +745,26 @@ cdef class StringHashTable(HashTable): | |||
for i in range(count): | ||||
uniques.append(values[uindexer[i]]) | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def unique(self, ndarray[object] values, bint return_inverse=False): | ||||
if return_inverse: | ||||
return self._unique_with_inverse(values, uniques=ObjectVector(), | ||||
ignore_na=False) | ||||
return self._unique_no_inverse(values) | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
return self._unique_with_inverse(values, uniques=ObjectVector(), | ||||
ignore_na=True) | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
return labels | ||||
|
||||
|
||||
cdef class PyObjectHashTable(HashTable): | ||||
|
@@ -814,7 +852,10 @@ cdef class PyObjectHashTable(HashTable): | |||
|
||||
return np.asarray(locs) | ||||
|
||||
def unique(self, ndarray[object] values): | ||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def _unique_no_inverse(self, ndarray[object] values): | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i am bothered by the naming, call these just |
||||
# define separate functions without inverse for performance | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
int ret = 0 | ||||
|
@@ -832,13 +873,15 @@ cdef class PyObjectHashTable(HashTable): | |||
|
||||
return uniques.to_array() | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior, int64_t na_sentinel, | ||||
object na_value=None): | ||||
@cython.boundscheck(False) | ||||
@cython.wraparound(False) | ||||
def _unique_with_inverse(self, ndarray[object] values, | ||||
ObjectVector uniques, bint ignore_na=False, | ||||
Py_ssize_t count_prior=0, | ||||
Py_ssize_t na_sentinel=-1, object na_value=None): | ||||
cdef: | ||||
Py_ssize_t i, n = len(values) | ||||
Py_ssize_t i, idx, count = count_prior, n = len(values) | ||||
int64_t[:] labels | ||||
Py_ssize_t idx, count = count_prior | ||||
int ret = 0 | ||||
object val | ||||
khiter_t k | ||||
|
@@ -851,20 +894,40 @@ cdef class PyObjectHashTable(HashTable): | |||
val = values[i] | ||||
hash(val) | ||||
|
||||
if ((val != val or val is None) or | ||||
(use_na_value and val == na_value)): | ||||
if ignore_na and ((val != val or val is None) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this keyword is completely untested There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would argue that it is (at least for Do we explicitly test cython code? I can if necessary. This PR does the required preparations on cython level to continue with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes of course, you added a keyword. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am talking about the ignore_na keyword There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know that I added this keyword to switch between Can you point me to where the cython hashtable code is tested? I could add some tests there. That's why I asked "Do we explicitly test cython code?", because I haven't come across it yet. Otherwise, I would add copious tests for the behaviour as soon as the keyword makes its way into the pandas API, but this is just the preparation for that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @h-vetinari : IMO If it is possible to hit those keyword arguments from Python land, then do add some tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's drop the ignore_na for now. and add it when I see that you actually need it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback |
||||
or (use_na_value and val == na_value)): | ||||
labels[i] = na_sentinel | ||||
continue | ||||
|
||||
k = kh_get_pymap(self.table, <PyObject*>val) | ||||
if k != self.table.n_buckets: | ||||
# k falls into a previous bucket | ||||
idx = self.table.vals[k] | ||||
labels[i] = idx | ||||
else: | ||||
# k hasn't been seen yet | ||||
k = kh_put_pymap(self.table, <PyObject*>val, &ret) | ||||
self.table.vals[k] = count | ||||
uniques.append(val) | ||||
labels[i] = count | ||||
count += 1 | ||||
|
||||
return np.asarray(labels) | ||||
return uniques.to_array(), np.asarray(labels) | ||||
|
||||
def unique(self, ndarray[object] values, bint return_inverse=False): | ||||
if return_inverse: | ||||
return self._unique_with_inverse(values, uniques=ObjectVector(), | ||||
ignore_na=False) | ||||
return self._unique_no_inverse(values) | ||||
|
||||
def factorize(self, ndarray[object] values): | ||||
return self._unique_with_inverse(values, uniques=ObjectVector(), ignore_na=True) | ||||
|
||||
def get_labels(self, ndarray[object] values, ObjectVector uniques, | ||||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||||
object na_value=None): | ||||
_, labels = self._unique_with_inverse(values, uniques, ignore_na=True, | ||||
count_prior=count_prior, | ||||
na_sentinel=na_sentinel, | ||||
na_value=na_value) | ||||
return labels |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
minor comment: since what this algo does is what we typically call "factorize" in pandas, I would call this function "_factorize"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jorisvandenbossche
Can we keep this for the follow-up please? It will make sense there, because then
factorize = unique_with_inverse(ignore_na=True)
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand your point here.
hashtable.factorize
already exists?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO it makes sense to do that here, because it is here you are changing
get_labels
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I thought this was a reply to the other comment :-)
Yes, and otherwise the other way around
unique(return_inverse=True) = factorize(ignore_na=False)
. The implemenation of that function is what factorize does in pandas (and not unique), so it makes sense to me to use that name.