diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 18ebc1ff2bd1f..eb0b46101c2d8 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -15,29 +15,66 @@ class Factorizer:
     count: int
     def __init__(self, size_hint: int) -> None: ...
     def get_count(self) -> int: ...
-
-class ObjectFactorizer(Factorizer):
-    table: PyObjectHashTable
-    uniques: ObjectVector
     def factorize(
         self,
-        values: npt.NDArray[np.object_],
+        values: np.ndarray,
         sort: bool = ...,
         na_sentinel=...,
         na_value=...,
+        mask=...,  # accepted at runtime; ObjectFactorizer raises if not None
     ) -> npt.NDArray[np.intp]: ...
 
+class ObjectFactorizer(Factorizer):
+    table: PyObjectHashTable
+    uniques: ObjectVector
+
 class Int64Factorizer(Factorizer):
     table: Int64HashTable
     uniques: Int64Vector
-    def factorize(
-        self,
-        values: np.ndarray,  # const int64_t[:]
-        sort: bool = ...,
-        na_sentinel=...,
-        na_value=...,
-        mask=...,
-    ) -> npt.NDArray[np.intp]: ...
+
+class UInt64Factorizer(Factorizer):
+    table: UInt64HashTable
+    uniques: UInt64Vector
+
+class Int32Factorizer(Factorizer):
+    table: Int32HashTable
+    uniques: Int32Vector
+
+class UInt32Factorizer(Factorizer):
+    table: UInt32HashTable
+    uniques: UInt32Vector
+
+class Int16Factorizer(Factorizer):
+    table: Int16HashTable
+    uniques: Int16Vector
+
+class UInt16Factorizer(Factorizer):
+    table: UInt16HashTable
+    uniques: UInt16Vector
+
+class Int8Factorizer(Factorizer):
+    table: Int8HashTable
+    uniques: Int8Vector
+
+class UInt8Factorizer(Factorizer):
+    table: UInt8HashTable
+    uniques: UInt8Vector
+
+class Float64Factorizer(Factorizer):
+    table: Float64HashTable
+    uniques: Float64Vector
+
+class Float32Factorizer(Factorizer):
+    table: Float32HashTable
+    uniques: Float32Vector
+
+class Complex64Factorizer(Factorizer):
+    table: Complex64HashTable
+    uniques: Complex64Vector
+
+class Complex128Factorizer(Factorizer):
+    table: Complex128HashTable
+    uniques: Complex128Vector
 
 class Int64Vector:
     def __init__(self, *args) -> None: ...
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index e4e9b24d725c6..ccac3d0b50d45 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -79,6 +79,9 @@ cdef class Factorizer: def get_count(self) -> int: return self.count + def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray: + raise NotImplementedError + cdef class ObjectFactorizer(Factorizer): cdef public: @@ -90,7 +93,7 @@ cdef class ObjectFactorizer(Factorizer): self.uniques = ObjectVector() def factorize( - self, ndarray[object] values, na_sentinel=-1, na_value=None + self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None ) -> np.ndarray: """ @@ -109,6 +112,9 @@ cdef class ObjectFactorizer(Factorizer): cdef: ndarray[intp_t] labels + if mask is not None: + raise NotImplementedError("mask not supported for ObjectFactorizer.") + if self.uniques.external_view_exists: uniques = ObjectVector() uniques.extend(self.uniques.to_array()) @@ -117,41 +123,3 @@ cdef class ObjectFactorizer(Factorizer): self.count, na_sentinel, na_value) self.count = len(self.uniques) return labels - - -cdef class Int64Factorizer(Factorizer): - cdef public: - Int64HashTable table - Int64Vector uniques - - def __cinit__(self, size_hint: int): - self.table = Int64HashTable(size_hint) - self.uniques = Int64Vector() - - def factorize(self, const int64_t[:] values, - na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: - """ - Returns - ------- - ndarray[intp_t] - - Examples - -------- - Factorize values with nans replaced by na_sentinel - - >>> fac = Int64Factorizer(3) - >>> fac.factorize(np.array([1,2,3]), na_sentinel=20) - array([0, 1, 2]) - """ - cdef: - ndarray[intp_t] labels - - if self.uniques.external_view_exists: - uniques = Int64Vector() - uniques.extend(self.uniques.to_array()) - self.uniques = uniques - labels = self.table.get_labels(values, self.uniques, - self.count, na_sentinel, - na_value=na_value, mask=mask) - self.count = 
len(self.uniques) - return labels diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index bda8cd83c0605..47dd0cbbd7164 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -101,6 +101,7 @@ from pandas._libs.khash cimport ( from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA + {{py: # name, dtype, c_type @@ -876,6 +877,44 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques {{endif}} + +cdef class {{name}}Factorizer(Factorizer): + cdef public: + {{name}}HashTable table + {{name}}Vector uniques + + def __cinit__(self, size_hint: int): + self.table = {{name}}HashTable(size_hint) + self.uniques = {{name}}Vector() + + def factorize(self, const {{c_type}}[:] values, + na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray: + """ + Returns + ------- + ndarray[intp_t] + + Examples + -------- + Factorize values with nans replaced by na_sentinel + + >>> fac = {{name}}Factorizer(3) + >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20) + array([0, 1, 2]) + """ + cdef: + ndarray[intp_t] labels + + if self.uniques.external_view_exists: + uniques = {{name}}Vector() + uniques.extend(self.uniques.to_array()) + self.uniques = uniques + labels = self.table.get_labels(values, self.uniques, + self.count, na_sentinel, + na_value=na_value, mask=mask) + self.count = len(self.uniques) + return labels + {{endfor}} diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 74a1051825820..cc9a7b7f8d40b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2362,14 +2362,8 @@ def _factorize_keys( rizer = klass(max(len(lk), len(rk))) - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - llab = 
rizer.factorize(lk) # type: ignore[arg-type] - # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type - # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], - # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" - rlab = rizer.factorize(rk) # type: ignore[arg-type] + llab = rizer.factorize(lk) + rlab = rizer.factorize(rk) assert llab.dtype == np.dtype(np.intp), llab.dtype assert rlab.dtype == np.dtype(np.intp), rlab.dtype