diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index a5679af44ac06..80d7ab58dc559 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -128,10 +128,12 @@ cdef struct Int64VectorData: int64_t *data Py_ssize_t n, m -cdef class Int64Vector: +cdef class Vector: + cdef bint external_view_exists + +cdef class Int64Vector(Vector): cdef Int64VectorData *data cdef ndarray ao - cdef bint external_view_exists cdef resize(self) cpdef ndarray to_array(self) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index b6278b3956a1d..0612acd25a5d5 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -12,34 +12,28 @@ def unique_label_indices( class Factorizer: - table: PyObjectHashTable - uniques: ObjectVector count: int def __init__(self, size_hint: int): ... def get_count(self) -> int: ... + +class ObjectFactorizer(Factorizer): + table: PyObjectHashTable + uniques: ObjectVector + def factorize( self, - values: np.ndarray, # np.ndarray[object] + values: np.ndarray, # ndarray[object] sort: bool = ..., na_sentinel=..., na_value=..., ) -> np.ndarray: ... # np.ndarray[intp] - def unique( - self, - values: np.ndarray, # np.ndarray[object] - ) -> np.ndarray: ... # np.ndarray[object] - -class Int64Factorizer: +class Int64Factorizer(Factorizer): table: Int64HashTable uniques: Int64Vector - count: int - - def __init__(self, size_hint: int): ... - def get_count(self) -> int: ... def factorize( self, @@ -240,3 +234,26 @@ def value_count_int64( np.ndarray, # np.ndarray[np.int64] np.ndarray, # np.ndarray[np.int64] ]: ... + + +def duplicated( + values: np.ndarray, + keep: Literal["last", "first", False] = ..., +) -> np.ndarray: ... # np.ndarray[bool] + +def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ... + +def value_count( + values: np.ndarray, + dropna: bool, +) -> tuple[ + np.ndarray, + np.ndarray, # np.ndarray[np.int64] +]: ... + + +# arr and values should have same dtype +def ismember( + arr: np.ndarray, + values: np.ndarray, +) -> np.ndarray: ... # np.ndarray[bool] diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 4566f22be2c36..7df3f69337643 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" cdef class Factorizer: - cdef public: - PyObjectHashTable table - ObjectVector uniques + cdef readonly: Py_ssize_t count - def __init__(self, size_hint: int): - self.table = PyObjectHashTable(size_hint) - self.uniques = ObjectVector() + def __cinit__(self, size_hint: int): self.count = 0 def get_count(self) -> int: return self.count + +cdef class ObjectFactorizer(Factorizer): + cdef public: + PyObjectHashTable table + ObjectVector uniques + + def __cinit__(self, size_hint: int): + self.table = PyObjectHashTable(size_hint) + self.uniques = ObjectVector() + def factorize( self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None ) -> np.ndarray: @@ -105,24 +111,15 @@ cdef class Factorizer: self.count = len(self.uniques) return labels - def unique(self, ndarray[object] values): - # just for fun - return self.table.unique(values) - -cdef class Int64Factorizer: +cdef class Int64Factorizer(Factorizer): cdef public: Int64HashTable table Int64Vector uniques - Py_ssize_t count - def __init__(self, size_hint: int): + def __cinit__(self, size_hint: int): self.table = Int64HashTable(size_hint) self.uniques = Int64Vector() - self.count = 0 - - def get_count(self) -> int: - return self.count def factorize(self, const int64_t[:] values, sort=False, na_sentinel=-1, na_value=None) -> np.ndarray: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 4cacd3245f9d8..6d51ea7d5de7b 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), {{if dtype != 'int64'}} +# Int64VectorData is defined in the .pxd file because it is needed (indirectly) +# by IntervalTree ctypedef struct {{name}}VectorData: {{c_type}} *data @@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: # Vector # ---------------------------------------------------------------------- +cdef class Vector: + # cdef readonly: + # bint external_view_exists + + def __cinit__(self): + self.external_view_exists = False + + {{py: # name, dtype, c_type @@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), {{for name, dtype, c_type in dtypes}} -cdef class {{name}}Vector: +cdef class {{name}}Vector(Vector): + # For int64 we have to put this declaration in the .pxd file; + # Int64Vector is the only one we need exposed for other cython files. {{if dtype != 'int64'}} cdef: - bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -201,7 +212,6 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() - self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) @@ -246,17 +256,15 @@ cdef class {{name}}Vector: {{endfor}} -cdef class StringVector: +cdef class StringVector(Vector): cdef: StringVectorData *data - bint external_view_exists def __cinit__(self): self.data = PyMem_Malloc(sizeof(StringVectorData)) if not self.data: raise MemoryError() - self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) @@ -314,16 +322,14 @@ cdef class StringVector: self.append(x[i]) -cdef class ObjectVector: +cdef class ObjectVector(Vector): cdef: PyObject **data Py_ssize_t n, m ndarray ao - bint external_view_exists def __cinit__(self): - self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 772d83e67394c..ceb473a0b06af 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128', @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): {{else}} -cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -107,9 +107,9 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): +cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} -def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): +cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: int ret = 0 @@ -189,9 +189,9 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): +cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): +cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} """ Return boolean of values in arr on an @@ -256,9 +256,9 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: {{if dtype == 'object'}} @@ -310,3 +310,163 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): return modes[:j + 1] {{endfor}} + + +ctypedef fused htfunc_t: + complex128_t + complex64_t + float64_t + float32_t + uint64_t + uint32_t + uint16_t + uint8_t + int64_t + int32_t + int16_t + int8_t + object + + +cpdef value_count(ndarray[htfunc_t] values, bint dropna): + if htfunc_t is object: + return value_count_object(values, dropna) + + elif htfunc_t is int8_t: + return value_count_int8(values, dropna) + elif htfunc_t is int16_t: + return value_count_int16(values, dropna) + elif htfunc_t is int32_t: + return value_count_int32(values, dropna) + elif htfunc_t is int64_t: + return value_count_int64(values, dropna) + + elif htfunc_t is uint8_t: + return value_count_uint8(values, dropna) + elif htfunc_t is uint16_t: + return value_count_uint16(values, dropna) + elif htfunc_t is uint32_t: + return value_count_uint32(values, dropna) + elif htfunc_t is uint64_t: + return value_count_uint64(values, dropna) + + elif htfunc_t is float64_t: + return value_count_float64(values, dropna) + elif htfunc_t is float32_t: + return value_count_float32(values, dropna) + + elif htfunc_t is complex128_t: + return value_count_complex128(values, dropna) + elif htfunc_t is complex64_t: + return value_count_complex64(values, dropna) + + else: + raise TypeError(values.dtype) + + +cpdef duplicated(ndarray[htfunc_t] values, object keep="first"): + if htfunc_t is object: + return duplicated_object(values, keep) + + elif htfunc_t is int8_t: + return duplicated_int8(values, keep) + elif htfunc_t is int16_t: + return duplicated_int16(values, keep) + elif htfunc_t is int32_t: + return duplicated_int32(values, keep) + elif htfunc_t is int64_t: + return duplicated_int64(values, keep) + + elif htfunc_t is uint8_t: + return duplicated_uint8(values, keep) + elif htfunc_t is uint16_t: + return duplicated_uint16(values, keep) + elif htfunc_t is uint32_t: + return duplicated_uint32(values, keep) + elif htfunc_t is uint64_t: + return duplicated_uint64(values, keep) + + elif htfunc_t is float64_t: + return duplicated_float64(values, keep) + elif htfunc_t is float32_t: + return duplicated_float32(values, keep) + + elif htfunc_t is complex128_t: + return duplicated_complex128(values, keep) + elif htfunc_t is complex64_t: + return duplicated_complex64(values, keep) + + else: + raise TypeError(values.dtype) + + +cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values): + if htfunc_t is object: + return ismember_object(arr, values) + + elif htfunc_t is int8_t: + return ismember_int8(arr, values) + elif htfunc_t is int16_t: + return ismember_int16(arr, values) + elif htfunc_t is int32_t: + return ismember_int32(arr, values) + elif htfunc_t is int64_t: + return ismember_int64(arr, values) + + elif htfunc_t is uint8_t: + return ismember_uint8(arr, values) + elif htfunc_t is uint16_t: + return ismember_uint16(arr, values) + elif htfunc_t is uint32_t: + return ismember_uint32(arr, values) + elif htfunc_t is uint64_t: + return ismember_uint64(arr, values) + + elif htfunc_t is float64_t: + return ismember_float64(arr, values) + elif htfunc_t is float32_t: + return ismember_float32(arr, values) + + elif htfunc_t is complex128_t: + return ismember_complex128(arr, values) + elif htfunc_t is complex64_t: + return ismember_complex64(arr, values) + + else: + raise TypeError(values.dtype) + + +cpdef mode(ndarray[htfunc_t] values, bint dropna): + if htfunc_t is object: + return mode_object(values, dropna) + + elif htfunc_t is int8_t: + return mode_int8(values, dropna) + elif htfunc_t is int16_t: + return mode_int16(values, dropna) + elif htfunc_t is int32_t: + return mode_int32(values, dropna) + elif htfunc_t is int64_t: + return mode_int64(values, dropna) + + elif htfunc_t is uint8_t: + return mode_uint8(values, dropna) + elif htfunc_t is uint16_t: + return mode_uint16(values, dropna) + elif htfunc_t is uint32_t: + return mode_uint32(values, dropna) + elif htfunc_t is uint64_t: + return mode_uint64(values, dropna) + + elif htfunc_t is float64_t: + return mode_float64(values, dropna) + elif htfunc_t is float32_t: + return mode_float32(values, dropna) + + elif htfunc_t is complex128_t: + return mode_complex128(values, dropna) + elif htfunc_t is complex64_t: + return mode_complex64(values, dropna) + + else: + raise TypeError(values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ce718d9c9c810..f8f5e5e05bc35 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -84,6 +84,8 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: + from typing import Literal + from pandas import ( Categorical, DataFrame, @@ -188,7 +190,7 @@ def _reconstruct_data( Parameters ---------- values : np.ndarray or ExtensionArray - dtype : np.ndtype or ExtensionDtype + dtype : np.dtype or ExtensionDtype original : AnyArrayLike Returns @@ -516,10 +518,7 @@ def f(c, v): ) values = values.astype(common, copy=False) comps = comps.astype(common, copy=False) - name = common.name - if name == "bool": - name = "uint8" - f = getattr(htable, f"ismember_{name}") + f = htable.ismember return f(comps, values) @@ -888,30 +887,24 @@ def value_counts_arraylike(values, dropna: bool): values = _ensure_arraylike(values) original = values values, _ = _ensure_data(values) - ndtype = values.dtype.name + + # TODO: handle uint8 + keys, counts = htable.value_count(values, dropna) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period - keys, counts = htable.value_count_int64(values, dropna) - if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] - else: - # ndarray like - - # TODO: handle uint8 - f = getattr(htable, f"value_count_{ndtype}") - keys, counts = f(values, dropna) - res_keys = _reconstruct_data(keys, original.dtype, original) - return res_keys, counts -def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray: +def duplicated( + values: ArrayLike, keep: Literal["first", "last", False] = "first" +) -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -931,9 +924,7 @@ def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray: duplicated : ndarray[bool] """ values, _ = _ensure_data(values) - ndtype = values.dtype.name - f = getattr(htable, f"duplicated_{ndtype}") - return f(values, keep=keep) + return htable.duplicated(values, keep=keep) def mode(values, dropna: bool = True) -> Series: @@ -971,16 +962,14 @@ def mode(values, dropna: bool = True) -> Series: values = values[~mask] values, _ = _ensure_data(values) - ndtype = values.dtype.name - f = getattr(htable, f"mode_{ndtype}") - result = f(values, dropna=dropna) + npresult = htable.mode(values, dropna=dropna) try: - result = np.sort(result) + npresult = np.sort(npresult) except TypeError as err: warn(f"Unable to sort modes: {err}") - result = _reconstruct_data(result, original.dtype, original) + result = _reconstruct_data(npresult, original.dtype, original) # Ensure index is type stable (should always use int index) return Series(result, index=ibase.default_index(len(result))) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a82c75f4b2557..26c582561cd3d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2183,11 +2183,9 @@ def mode(self, dropna=True): if dropna: good = self._codes != -1 codes = self._codes[good] - # error: Incompatible types in assignment (expression has type "List[Any]", - # variable has type "ndarray") - codes = sorted( # type: ignore[assignment] - htable.mode_int64(ensure_int64(codes), dropna) - ) + + codes = htable.mode(codes, dropna) + codes.sort() codes = coerce_indexer_dtype(codes, self.dtype.categories) return self._from_backing_data(codes) diff --git a/pandas/core/base.py b/pandas/core/base.py index 3270e3dd82f7d..adc904d80fea8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -61,6 +61,8 @@ import pandas.core.nanops as nanops if TYPE_CHECKING: + from typing import Literal + from pandas import Categorical _shared_docs: dict[str, str] = {} @@ -1258,5 +1260,7 @@ def drop_duplicates(self, keep="first"): return self[~duplicated] # type: ignore[index] @final - def _duplicated(self, keep: str | bool = "first") -> np.ndarray: + def _duplicated( + self, keep: Literal["first", "last", False] = "first" + ) -> np.ndarray: return duplicated(self._values, keep=keep) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d3042507d930..899526694f4d9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -43,6 +43,7 @@ lib, properties, ) +from pandas._libs.hashtable import duplicated from pandas._libs.lib import no_default from pandas._typing import ( AggFuncType, @@ -6141,7 +6142,6 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) @@ -6181,7 +6181,7 @@ def f(vals) -> tuple[np.ndarray, int]: sort=False, xnull=False, ) - result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + result = self._constructor_sliced(duplicated(ids, keep), index=self.index) return result.__finalize__(self, method="duplicated") # ---------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 84f1245299d53..5895d12622aa1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2685,7 +2685,7 @@ def drop_duplicates(self: _IndexT, keep: str_t | bool = "first") -> _IndexT: return super().drop_duplicates(keep=keep) - def duplicated(self, keep: str_t | bool = "first") -> np.ndarray: + def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray: """ Indicate duplicate index values. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a68238af003e4..4e4bcd570391d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -25,7 +25,7 @@ index as libindex, lib, ) -from pandas._libs.hashtable import duplicated_int64 +from pandas._libs.hashtable import duplicated from pandas._typing import ( AnyArrayLike, DtypeObj, @@ -1614,7 +1614,7 @@ def duplicated(self, keep="first") -> np.ndarray: shape = tuple(len(lev) for lev in self.levels) ids = get_group_index(self.codes, shape, sort=False, xnull=False) - return duplicated_int64(ids, keep) + return duplicated(ids, keep) # error: Cannot override final attribute "_duplicated" # (previously declared in base class "IndexOpsMixin") diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8478e2a17efa5..f8085b2bab1ed 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2153,7 +2153,7 @@ def _factorize_keys( rk = ensure_int64(np.asarray(rk, dtype=np.int64)) else: - klass = libhashtable.Factorizer + klass = libhashtable.ObjectFactorizer lk = ensure_object(lk) rk = ensure_object(rk) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index bc0b6e0b028a8..ea59d55989f8b 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -253,7 +253,7 @@ def test_duplicated_large(keep): mi = MultiIndex(levels=levels, codes=codes) result = mi.duplicated(keep=keep) - expected = hashtable.duplicated_object(mi.values, keep=keep) + expected = hashtable.duplicated(mi.values, keep=keep) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 04a8aeefbfcd6..aeff591e3f0dc 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -278,7 +278,7 @@ def test_unique(self, table_type, dtype): def get_ht_function(fun_name, type_suffix): - return getattr(ht, fun_name + "_" + type_suffix) + return getattr(ht, fun_name) @pytest.mark.parametrize( @@ -374,7 +374,7 @@ def test_modes_with_nans(): values = np.array([True, pd.NA, np.nan], dtype=np.object_) # pd.Na and np.nan will have the same representative: np.nan # thus we have 2 nans and 1 True - modes = ht.mode_object(values, False) + modes = ht.mode(values, False) assert modes.size == 1 assert np.isnan(modes[0]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 964dd9bdd0e0a..4df95d895e475 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -194,7 +194,7 @@ def test_factorize_nan(self): # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer key = np.array([1, 2, 1, np.nan], dtype="O") - rizer = ht.Factorizer(len(key)) + rizer = ht.ObjectFactorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) expected = np.array([0, 1, 0, na_sentinel], dtype="int32")