From 7ff424e6d9699f19d514ca5417d49d70b8f1b2ca Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Apr 2021 07:21:18 -0700 Subject: [PATCH 1/2] REF: handle dtype dispatch in libhashtable --- pandas/_libs/hashtable_func_helper.pxi.in | 176 +++++++++++++++++- pandas/core/algorithms.py | 28 +-- pandas/core/arrays/categorical.py | 8 +- pandas/core/frame.py | 4 +- pandas/core/indexes/multi.py | 4 +- pandas/core/reshape/merge.py | 3 +- pandas/tests/indexes/multi/test_duplicates.py | 2 +- pandas/tests/libs/test_hashtable.py | 4 +- pandas/tests/test_algos.py | 2 +- 9 files changed, 188 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 772d83e67394c..ceb473a0b06af 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -31,9 +31,9 @@ dtypes = [('Complex128', 'complex128', 'complex128', @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): +cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN): {{else}} -cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -107,9 +107,9 @@ cpdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): +cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} -def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): +cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): {{endif}} cdef: int ret = 0 @@ -189,9 +189,9 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): +cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): +cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): {{endif}} """ Return boolean of values in arr on an @@ -256,9 +256,9 @@ def ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} -def mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): +cdef mode_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): +cdef mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): {{endif}} cdef: {{if dtype == 'object'}} @@ -310,3 +310,163 @@ def mode_{{dtype}}(const {{dtype}}_t[:] values, bint dropna): return modes[:j + 1] {{endfor}} + + +ctypedef fused htfunc_t: + complex128_t + complex64_t + float64_t + float32_t + uint64_t + uint32_t + uint16_t + uint8_t + int64_t + int32_t + int16_t + int8_t + object + + +cpdef value_count(ndarray[htfunc_t] values, bint dropna): + if htfunc_t is object: + return value_count_object(values, dropna) + + elif htfunc_t is int8_t: + return value_count_int8(values, dropna) + elif htfunc_t is int16_t: + return value_count_int16(values, dropna) + elif htfunc_t is int32_t: + return value_count_int32(values, dropna) + elif htfunc_t is int64_t: + return value_count_int64(values, dropna) + + elif htfunc_t is uint8_t: + return value_count_uint8(values, dropna) + elif htfunc_t is uint16_t: + return value_count_uint16(values, dropna) + elif htfunc_t is uint32_t: + return value_count_uint32(values, dropna) + elif htfunc_t is uint64_t: + return value_count_uint64(values, dropna) + + elif htfunc_t is float64_t: + return value_count_float64(values, dropna) + elif htfunc_t is float32_t: + return value_count_float32(values, dropna) + + elif htfunc_t is complex128_t: + return value_count_complex128(values, dropna) + elif htfunc_t is complex64_t: + return value_count_complex64(values, dropna) + + else: + raise TypeError(values.dtype) + + +cpdef duplicated(ndarray[htfunc_t] values, object keep="first"): + if htfunc_t is object: + return duplicated_object(values, keep) + + elif htfunc_t is int8_t: + return duplicated_int8(values, keep) + elif htfunc_t is int16_t: + return duplicated_int16(values, keep) + elif htfunc_t is int32_t: + return duplicated_int32(values, keep) + elif htfunc_t is int64_t: + return duplicated_int64(values, keep) + + elif htfunc_t is uint8_t: + return duplicated_uint8(values, keep) + elif htfunc_t is uint16_t: + return duplicated_uint16(values, keep) + elif htfunc_t is uint32_t: + return duplicated_uint32(values, keep) + elif htfunc_t is uint64_t: + return duplicated_uint64(values, keep) + + elif htfunc_t is float64_t: + return duplicated_float64(values, keep) + elif htfunc_t is float32_t: + return duplicated_float32(values, keep) + + elif htfunc_t is complex128_t: + return duplicated_complex128(values, keep) + elif htfunc_t is complex64_t: + return duplicated_complex64(values, keep) + + else: + raise TypeError(values.dtype) + + +cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values): + if htfunc_t is object: + return ismember_object(arr, values) + + elif htfunc_t is int8_t: + return ismember_int8(arr, values) + elif htfunc_t is int16_t: + return ismember_int16(arr, values) + elif htfunc_t is int32_t: + return ismember_int32(arr, values) + elif htfunc_t is int64_t: + return ismember_int64(arr, values) + + elif htfunc_t is uint8_t: + return ismember_uint8(arr, values) + elif htfunc_t is uint16_t: + return ismember_uint16(arr, values) + elif htfunc_t is uint32_t: + return ismember_uint32(arr, values) + elif htfunc_t is uint64_t: + return ismember_uint64(arr, values) + + elif htfunc_t is float64_t: + return ismember_float64(arr, values) + elif htfunc_t is float32_t: + return ismember_float32(arr, values) + + elif htfunc_t is complex128_t: + return ismember_complex128(arr, values) + elif htfunc_t is complex64_t: + return ismember_complex64(arr, values) + + else: + raise TypeError(values.dtype) + + +cpdef mode(ndarray[htfunc_t] values, bint dropna): + if htfunc_t is object: + return mode_object(values, dropna) + + elif htfunc_t is int8_t: + return mode_int8(values, dropna) + elif htfunc_t is int16_t: + return mode_int16(values, dropna) + elif htfunc_t is int32_t: + return mode_int32(values, dropna) + elif htfunc_t is int64_t: + return mode_int64(values, dropna) + + elif htfunc_t is uint8_t: + return mode_uint8(values, dropna) + elif htfunc_t is uint16_t: + return mode_uint16(values, dropna) + elif htfunc_t is uint32_t: + return mode_uint32(values, dropna) + elif htfunc_t is uint64_t: + return mode_uint64(values, dropna) + + elif htfunc_t is float64_t: + return mode_float64(values, dropna) + elif htfunc_t is float32_t: + return mode_float32(values, dropna) + + elif htfunc_t is complex128_t: + return mode_complex128(values, dropna) + elif htfunc_t is complex64_t: + return mode_complex64(values, dropna) + + else: + raise TypeError(values.dtype) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e2dd846f0379..6a5d1470a2c9b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -215,7 +215,7 @@ def _reconstruct_data( Parameters ---------- values : np.ndarray or ExtensionArray - dtype : np.ndtype or ExtensionDtype + dtype : np.dtype or ExtensionDtype original : AnyArrayLike Returns @@ -519,10 +519,7 @@ def f(c, v): ) values = values.astype(common, copy=False) comps = comps.astype(common, copy=False) - name = common.name - if name == "bool": - name = "uint8" - f = getattr(htable, f"ismember_{name}") + f = htable.ismember return f(comps, values) @@ -891,24 +888,17 @@ def value_counts_arraylike(values, dropna: bool): values = _ensure_arraylike(values) original = values values, _ = _ensure_data(values) - ndtype = values.dtype.name + + # TODO: handle uint8 + keys, counts = htable.value_count(values, dropna) if needs_i8_conversion(original.dtype): # datetime, timedelta, or period - keys, counts = htable.value_count_int64(values, dropna) - if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] - else: - # ndarray like - - # TODO: handle uint8 - f = getattr(htable, f"value_count_{ndtype}") - keys, counts = f(values, dropna) - keys = _reconstruct_data(keys, original.dtype, original) return keys, counts @@ -934,9 +924,7 @@ def duplicated(values: ArrayLike, keep: str | bool = "first") -> np.ndarray: duplicated : ndarray[bool] """ values, _ = _ensure_data(values) - ndtype = values.dtype.name - f = getattr(htable, f"duplicated_{ndtype}") - return f(values, keep=keep) + return htable.duplicated(values, keep=keep) def mode(values, dropna: bool = True) -> Series: @@ -974,10 +962,8 @@ def mode(values, dropna: bool = True) -> Series: values = values[~mask] values, _ = _ensure_data(values) - ndtype = values.dtype.name - f = getattr(htable, f"mode_{ndtype}") - result = f(values, dropna=dropna) + result = htable.mode(values, dropna=dropna) try: result = np.sort(result) except TypeError as err: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f2b5ad447a0cf..855930d616eb2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2113,11 +2113,9 @@ def mode(self, dropna=True): if dropna: good = self._codes != -1 codes = self._codes[good] - # error: Incompatible types in assignment (expression has type "List[Any]", - # variable has type "ndarray") - codes = sorted( # type: ignore[assignment] - htable.mode_int64(ensure_int64(codes), dropna) - ) + + codes = htable.mode(codes, dropna) + codes.sort() codes = coerce_indexer_dtype(codes, self.dtype.categories) return self._from_backing_data(codes) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 045776c3f5c50..42a13e5956b3d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -43,6 +43,7 @@ lib, properties, ) +from pandas._libs.hashtable import duplicated from pandas._libs.lib import no_default from pandas._typing import ( AggFuncType, @@ -6022,7 +6023,6 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) @@ -6055,7 +6055,7 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + result = self._constructor_sliced(duplicated(ids, keep), index=self.index) return result.__finalize__(self, method="duplicated") # ---------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3305610a4022e..f551823717a41 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -25,7 +25,7 @@ index as libindex, lib, ) -from pandas._libs.hashtable import duplicated_int64 +from pandas._libs.hashtable import duplicated from pandas._typing import ( AnyArrayLike, DtypeObj, @@ -1614,7 +1614,7 @@ def duplicated(self, keep="first") -> np.ndarray: shape = map(len, self.levels) ids = get_group_index(self.codes, shape, sort=False, xnull=False) - return duplicated_int64(ids, keep) + return duplicated(ids, keep) # error: Cannot override final attribute "_duplicated" # (previously declared in base class "IndexOpsMixin") diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8cee0dd2abb88..553d305ba4977 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2138,6 +2138,7 @@ def _factorize_keys( # "_values_for_factorize" rk, _ = rk._values_for_factorize() # type: ignore[union-attr,assignment] + klass: type[libhashtable.Factorizer] if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype @@ -2152,7 +2153,7 @@ def _factorize_keys( rk = ensure_int64(np.asarray(rk, dtype=np.int64)) else: - klass = libhashtable.Factorizer + klass = libhashtable.ObjectFactorizer lk = ensure_object(lk) rk = ensure_object(rk) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index bc0b6e0b028a8..ea59d55989f8b 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -253,7 +253,7 @@ def test_duplicated_large(keep): mi = MultiIndex(levels=levels, codes=codes) result = mi.duplicated(keep=keep) - expected = hashtable.duplicated_object(mi.values, keep=keep) + expected = hashtable.duplicated(mi.values, keep=keep) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 04a8aeefbfcd6..aeff591e3f0dc 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -278,7 +278,7 @@ def test_unique(self, table_type, dtype): def get_ht_function(fun_name, type_suffix): - return getattr(ht, fun_name + "_" + type_suffix) + return getattr(ht, fun_name) @pytest.mark.parametrize( @@ -374,7 +374,7 @@ def test_modes_with_nans(): values = np.array([True, pd.NA, np.nan], dtype=np.object_) # pd.Na and np.nan will have the same representative: np.nan # thus we have 2 nans and 1 True - modes = ht.mode_object(values, False) + modes = ht.mode(values, False) assert modes.size == 1 assert np.isnan(modes[0]) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 127baae6e9352..e73aaecee5d4a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -191,7 +191,7 @@ def test_factorize_nan(self): # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer key = np.array([1, 2, 1, np.nan], dtype="O") - rizer = ht.Factorizer(len(key)) + rizer = ht.ObjectFactorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) expected = np.array([0, 1, 0, na_sentinel], dtype="int32") From 39187135b5ea2d8e7eca2eaf330e11720c0e8a5e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Apr 2021 07:21:56 -0700 Subject: [PATCH 2/2] REF: Share Factorizer/Vector code --- pandas/_libs/hashtable.pxd | 6 +++-- pandas/_libs/hashtable.pyx | 31 ++++++++++------------ pandas/_libs/hashtable_class_helper.pxi.in | 24 ++++++++++------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index a5679af44ac06..80d7ab58dc559 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -128,10 +128,12 @@ cdef struct Int64VectorData: int64_t *data Py_ssize_t n, m -cdef class Int64Vector: +cdef class Vector: + cdef bint external_view_exists + +cdef class Int64Vector(Vector): cdef Int64VectorData *data cdef ndarray ao - cdef bint external_view_exists cdef resize(self) cpdef ndarray to_array(self) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 1e2a336f12444..b23daf49d4a91 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" cdef class Factorizer: - cdef public: - PyObjectHashTable table - ObjectVector uniques + cdef readonly: Py_ssize_t count - def __init__(self, size_hint: int): - self.table = PyObjectHashTable(size_hint) - self.uniques = ObjectVector() + def __cinit__(self, size_hint: int): self.count = 0 def get_count(self) -> int: return self.count + +cdef class ObjectFactorizer(Factorizer): + cdef public: + PyObjectHashTable table + ObjectVector uniques + + def __cinit__(self, size_hint: int): + self.table = PyObjectHashTable(size_hint) + self.uniques = ObjectVector() + def factorize( self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None ) -> np.ndarray: @@ -105,24 +111,15 @@ cdef class Factorizer: self.count = len(self.uniques) return labels - def unique(self, ndarray[object] values): - # just for fun - return self.table.unique(values) - -cdef class Int64Factorizer: +cdef class Int64Factorizer(Factorizer): cdef public: Int64HashTable table Int64Vector uniques - Py_ssize_t count - def __init__(self, size_hint: int): + def __cinit__(self, size_hint: int): self.table = Int64HashTable(size_hint) self.uniques = Int64Vector() - self.count = 0 - - def get_count(self) -> int: - return self.count def factorize(self, const int64_t[:] values, sort=False, na_sentinel=-1, na_value=None) -> np.ndarray: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index b80a127be970d..3745a1957fd3a 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), {{if dtype != 'int64'}} +# Int64VectorData is defined in the .pxd file because it is needed (indirectly) +# by IntervalTree ctypedef struct {{name}}VectorData: {{c_type}} *data @@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: # Vector # ---------------------------------------------------------------------- +cdef class Vector: + # cdef readonly: + # bint external_view_exists + + def __cinit__(self): + self.external_view_exists = False + + {{py: # name, dtype, c_type @@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'), {{for name, dtype, c_type in dtypes}} -cdef class {{name}}Vector: +cdef class {{name}}Vector(Vector): + # For int64 we have to put this declaration in the .pxd file; + # Int64Vector is the only one we need exposed for other cython files. {{if dtype != 'int64'}} cdef: - bint external_view_exists {{name}}VectorData *data ndarray ao {{endif}} @@ -201,7 +212,6 @@ cdef class {{name}}Vector: sizeof({{name}}VectorData)) if not self.data: raise MemoryError() - self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) @@ -246,17 +256,15 @@ cdef class {{name}}Vector: {{endfor}} -cdef class StringVector: +cdef class StringVector(Vector): cdef: StringVectorData *data - bint external_view_exists def __cinit__(self): self.data = PyMem_Malloc(sizeof(StringVectorData)) if not self.data: raise MemoryError() - self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP self.data.data = malloc(self.data.m * sizeof(char *)) @@ -314,16 +322,14 @@ cdef class StringVector: self.append(x[i]) -cdef class ObjectVector: +cdef class ObjectVector(Vector): cdef: PyObject **data Py_ssize_t n, m ndarray ao - bint external_view_exists def __cinit__(self): - self.external_view_exists = False self.n = 0 self.m = _INIT_VEC_CAP self.ao = np.empty(_INIT_VEC_CAP, dtype=object)