CLN: factorize returns ndarray[intp], not int64 #40474

Merged 8 commits on Mar 20, 2021

26 changes: 18 additions & 8 deletions pandas/_libs/hashtable.pyx
@@ -66,20 +66,28 @@ cdef class Factorizer:
self.uniques = ObjectVector()
self.count = 0

def get_count(self):
def get_count(self) -> int:
return self.count

def factorize(
self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
):
) -> np.ndarray:
"""

Returns
-------
np.ndarray[np.intp]

Examples
--------
Factorize values with nans replaced by na_sentinel

>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
cdef:
ndarray[intp_t] labels

if self.uniques.external_view_exists:
uniques = ObjectVector()
uniques.extend(self.uniques.to_array())
@@ -89,8 +97,6 @@ cdef class Factorizer:
mask = (labels == na_sentinel)
# sort on
if sort:
if labels.dtype != np.intp:
labels = labels.astype(np.intp)
sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
@@ -119,15 +125,22 @@ cdef class Int64Factorizer:
return self.count

def factorize(self, const int64_t[:] values, sort=False,
na_sentinel=-1, na_value=None):
na_sentinel=-1, na_value=None) -> np.ndarray:
"""
Returns
-------
ndarray[intp_t]

Examples
--------
Factorize values with nans replaced by na_sentinel

>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
array([ 0, 1, 20])
"""
cdef:
ndarray[intp_t] labels

if self.uniques.external_view_exists:
uniques = Int64Vector()
uniques.extend(self.uniques.to_array())
@@ -138,9 +151,6 @@ cdef class Int64Factorizer:

# sort on
if sort:
if labels.dtype != np.intp:
labels = labels.astype(np.intp)

sorter = self.uniques.to_array().argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))
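
The net effect for hashtable.pyx is that both Factorizer and Int64Factorizer now hand back platform-integer labels, which is why the astype(np.intp) fallback in the sort branch could be deleted. A minimal sketch of how this can be checked against a build of this branch, mirroring the docstring example added above (the size-hint constructor argument and the private module path are assumptions):

    import numpy as np
    from pandas._libs import hashtable as ht

    values = np.array([1, 2, np.nan], dtype="O")
    factorizer = ht.Factorizer(len(values))      # assumed: constructor takes a size hint
    labels = factorizer.factorize(values, na_sentinel=20)

    assert labels.dtype == np.intp               # intp now, previously int64
    assert labels.tolist() == [0, 1, 20]         # matches the docstring example
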
36 changes: 18 additions & 18 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -539,12 +539,12 @@ cdef class {{name}}HashTable(HashTable):
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
intp_t[:] labels
int ret = 0
{{c_type}} val, na_value2
khiter_t k
@@ -553,7 +553,7 @@ cdef class {{name}}HashTable(HashTable):
uint8_t[:] mask_values

if return_inverse:
labels = np.empty(n, dtype=np.int64)
labels = np.empty(n, dtype=np.intp)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None
@@ -614,7 +614,7 @@ cdef class {{name}}HashTable(HashTable):
labels[i] = idx

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
@@ -633,7 +633,7 @@ cdef class {{name}}HashTable(HashTable):
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse)
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = {{name}}Vector()
@@ -668,7 +668,7 @@ cdef class {{name}}HashTable(HashTable):
-------
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64]
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = {{name}}Vector()
@@ -918,12 +918,12 @@ cdef class StringHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
intp_t[:] labels
int64_t[:] uindexer
int ret = 0
object val
@@ -933,7 +933,7 @@ cdef class StringHashTable(HashTable):
bint use_na_value

if return_inverse:
labels = np.zeros(n, dtype=np.int64)
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None

@@ -972,13 +972,13 @@ cdef class StringHashTable(HashTable):
uindexer[count] = i
if return_inverse:
self.table.vals[k] = count
labels[i] = <int64_t>count
labels[i] = count
count += 1
elif return_inverse:
# k falls into a previous bucket
# only relevant in case we need to construct the inverse
idx = self.table.vals[k]
labels[i] = <int64_t>idx
labels[i] = idx

free(vecs)

@@ -987,7 +987,7 @@ cdef class StringHashTable(HashTable):
uniques.append(values[uindexer[i]])

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
@@ -1193,19 +1193,19 @@ cdef class PyObjectHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse=True)
labels : ndarray[intp_t] (if return_inverse=True)
The labels from values to uniques
"""
cdef:
Py_ssize_t i, idx, count = count_prior, n = len(values)
int64_t[:] labels
intp_t[:] labels
int ret = 0
object val
khiter_t k
bint use_na_value

if return_inverse:
labels = np.empty(n, dtype=np.int64)
labels = np.empty(n, dtype=np.intp)
use_na_value = na_value is not None

for i in range(n):
@@ -1240,7 +1240,7 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = idx

if return_inverse:
return uniques.to_array(), np.asarray(labels)
return uniques.to_array(), labels.base # .base -> underlying ndarray
return uniques.to_array()

def unique(self, ndarray[object] values, bint return_inverse=False):
@@ -1259,7 +1259,7 @@ cdef class PyObjectHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64] (if return_inverse)
labels : ndarray[intp_t] (if return_inverse)
The labels from values to uniques
"""
uniques = ObjectVector()
@@ -1292,7 +1292,7 @@ cdef class PyObjectHashTable(HashTable):
-------
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
labels : ndarray[intp_t]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
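
The template changes above do two things: the inverse labels built by _unique are typed intp_t from the start, and the function returns labels.base, the ndarray the typed memoryview was created over, rather than wrapping the memoryview with np.asarray. A hedged sketch of the resulting behaviour for one generated class (the constructor default and factorize signature are assumed from the docstrings in this diff):

    import numpy as np
    from pandas._libs import hashtable as ht

    values = np.array([3, 1, 3, 2, 1], dtype=np.int64)
    table = ht.Int64HashTable()
    uniques, labels = table.factorize(values)   # assumed to wrap _unique(..., return_inverse=True)

    assert uniques.dtype == np.int64            # uniques keep the input dtype
    assert labels.dtype == np.intp              # inverse labels are platform int
    assert (uniques[labels] == values).all()    # labels map each value back to its unique
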
20 changes: 11 additions & 9 deletions pandas/_libs/join.pyx
@@ -10,6 +10,7 @@ from numpy cimport (
int16_t,
int32_t,
int64_t,
intp_t,
ndarray,
uint8_t,
uint16_t,
@@ -20,14 +21,15 @@ from numpy cimport (
cnp.import_array()

from pandas._libs.algos import (
ensure_int64,
ensure_platform_int,
groupsort_indexer,
take_1d_int64_int64,
)


@cython.boundscheck(False)
def inner_join(const int64_t[:] left, const int64_t[:] right,
def inner_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
@@ -39,8 +41,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right,

# NA group in location 0

left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
@@ -78,7 +80,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right,


@cython.boundscheck(False)
def left_outer_join(const int64_t[:] left, const int64_t[:] right,
def left_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
@@ -91,8 +93,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right,

# NA group in location 0

left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
@@ -151,7 +153,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right,


@cython.boundscheck(False)
def full_outer_join(const int64_t[:] left, const int64_t[:] right,
def full_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
@@ -163,8 +165,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right,

# NA group in location 0

left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
left_sorter, left_count = groupsort_indexer(ensure_int64(left), max_groups)
right_sorter, right_count = groupsort_indexer(ensure_int64(right), max_groups)

with nogil:
# First pass, determine size of result set, do not use the NA group
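
With the intp signatures in place the low-level join routines consume factorized group codes directly; only groupsort_indexer, which still works on int64, needs the ensure_int64 shim. A small usage sketch (not an exhaustive test) that reuses the arrays from the updated tests below, where the inputs are group codes and max_groups is the number of distinct groups:

    import numpy as np
    from pandas._libs import join as libjoin

    left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
    right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)

    # positions into `left` and `right` whose group codes match
    lidx, ridx = libjoin.inner_join(left, right, 5)
    assert len(lidx) == len(ridx)
    assert (left[lidx] == right[ridx]).all()
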
15 changes: 10 additions & 5 deletions pandas/core/reshape/merge.py
@@ -1973,7 +1973,7 @@ def _get_single_indexer(join_key, index, sort: bool = False):
left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

left_indexer, right_indexer = libjoin.left_outer_join(
ensure_int64(left_key), ensure_int64(right_key), count, sort=sort
left_key, right_key, count, sort=sort
)

return left_indexer, right_indexer
@@ -2029,9 +2029,9 @@ def _factorize_keys(
Returns
-------
array
np.ndarray[np.intp]
Left (resp. right if called with `key='right'`) labels, as enumerated type.
array
np.ndarray[np.intp]
Right (resp. left if called with `key='right'`) labels, as enumerated type.
int
Number of unique elements in union of left and right labels.
@@ -2117,6 +2117,8 @@ def _factorize_keys(

llab = rizer.factorize(lk)
rlab = rizer.factorize(rk)
assert llab.dtype == np.intp, llab.dtype
assert rlab.dtype == np.intp, rlab.dtype

count = rizer.get_count()

@@ -2142,13 +2144,16 @@
return llab, rlab, count


def _sort_labels(uniques: np.ndarray, left, right):
def _sort_labels(
uniques: np.ndarray, left: np.ndarray, right: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
# Both returned ndarrays are np.intp

llength = len(left)
labels = np.concatenate([left, right])

_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = ensure_int64(new_labels)
assert new_labels.dtype == np.intp
new_left, new_right = new_labels[:llength], new_labels[llength:]

return new_left, new_right
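
On the pandas side _factorize_keys now forwards whatever the factorizer returns, with asserts guarding the intp contract. A hedged end-to-end check using the private helper directly; the inputs follow the helper's own docstring example in pandas (an assumption about that docstring), and the exact label values are illustrative:

    import numpy as np
    import pandas as pd
    from pandas.core.reshape.merge import _factorize_keys

    llab, rlab, count = _factorize_keys(pd.array(["a", "c", "b"]), pd.array(["a", "c"]))

    assert llab.dtype == np.intp and rlab.dtype == np.intp   # the contract asserted above
    assert count == 3                                        # union of keys: "a", "b", "c"
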
16 changes: 8 additions & 8 deletions pandas/tests/libs/test_join.py
@@ -46,8 +46,8 @@ def test_outer_join_indexer(self, dtype):
tm.assert_numpy_array_equal(rindexer, exp)

def test_cython_left_outer_join(self):
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
max_group = 5

ls, rs = left_outer_join(left, right, max_group)
@@ -70,8 +70,8 @@ def test_cython_left_outer_join(self):
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

def test_cython_right_outer_join(self):
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
max_group = 5

rs, ls = left_outer_join(right, left, max_group)
@@ -116,8 +116,8 @@ def test_cython_right_outer_join(self):
tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

def test_cython_inner_join(self):
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
max_group = 5

ls, rs = inner_join(left, right, max_group)
Expand Down Expand Up @@ -256,10 +256,10 @@ def test_left_outer_join_bug():
0,
2,
],
dtype=np.int64,
dtype=np.intp,
)

right = np.array([3, 1], dtype=np.int64)
right = np.array([3, 1], dtype=np.intp)
max_groups = 4

lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)
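
The dtype switch in these tests is not cosmetic: np.intp follows the platform pointer size, so it is an alias for int64 only on 64-bit builds and is 32 bits wide on 32-bit platforms, which is why the test fixtures now have to match the intp signatures of the join routines. A quick illustration:

    import numpy as np

    # 8 on 64-bit builds, 4 on 32-bit builds
    print(np.dtype(np.intp).itemsize)

    codes = np.array([0, 1, 2], dtype=np.intp)
    print(codes.dtype)   # int64 on 64-bit platforms, int32 on 32-bit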