Skip to content

Commit cfbd076

Browse files
authored
REF: handle dtype dispatch in libhashtable, share Vector/Hashtable code (#40944)
1 parent 59df6a8 commit cfbd076

15 files changed

+263
-90
lines changed

pandas/_libs/hashtable.pxd

+4-2
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,12 @@ cdef struct Int64VectorData:
128128
int64_t *data
129129
Py_ssize_t n, m
130130

131-
cdef class Int64Vector:
131+
cdef class Vector:
132+
cdef bint external_view_exists
133+
134+
cdef class Int64Vector(Vector):
132135
cdef Int64VectorData *data
133136
cdef ndarray ao
134-
cdef bint external_view_exists
135137

136138
cdef resize(self)
137139
cpdef ndarray to_array(self)

pandas/_libs/hashtable.pyi

+30-13
Original file line numberDiff line numberDiff line change
@@ -12,34 +12,28 @@ def unique_label_indices(
1212

1313

1414
class Factorizer:
15-
table: PyObjectHashTable
16-
uniques: ObjectVector
1715
count: int
1816

1917
def __init__(self, size_hint: int): ...
2018
def get_count(self) -> int: ...
2119

20+
21+
class ObjectFactorizer(Factorizer):
22+
table: PyObjectHashTable
23+
uniques: ObjectVector
24+
2225
def factorize(
2326
self,
24-
values: np.ndarray, # np.ndarray[object]
27+
values: np.ndarray, # ndarray[object]
2528
sort: bool = ...,
2629
na_sentinel=...,
2730
na_value=...,
2831
) -> np.ndarray: ... # np.ndarray[intp]
2932

30-
def unique(
31-
self,
32-
values: np.ndarray, # np.ndarray[object]
33-
) -> np.ndarray: ... # np.ndarray[object]
34-
3533

36-
class Int64Factorizer:
34+
class Int64Factorizer(Factorizer):
3735
table: Int64HashTable
3836
uniques: Int64Vector
39-
count: int
40-
41-
def __init__(self, size_hint: int): ...
42-
def get_count(self) -> int: ...
4337

4438
def factorize(
4539
self,
@@ -240,3 +234,26 @@ def value_count_int64(
240234
np.ndarray, # np.ndarray[np.int64]
241235
np.ndarray, # np.ndarray[np.int64]
242236
]: ...
237+
238+
239+
def duplicated(
240+
values: np.ndarray,
241+
keep: Literal["last", "first", False] = ...,
242+
) -> np.ndarray: ... # np.ndarray[bool]
243+
244+
def mode(values: np.ndarray, dropna: bool) -> np.ndarray: ...
245+
246+
def value_count(
247+
values: np.ndarray,
248+
dropna: bool,
249+
) -> tuple[
250+
np.ndarray,
251+
np.ndarray, # np.ndarray[np.int64]
252+
]: ...
253+
254+
255+
# arr and values should have the same dtype
256+
def ismember(
257+
arr: np.ndarray,
258+
values: np.ndarray,
259+
) -> np.ndarray: ... # np.ndarray[bool]

pandas/_libs/hashtable.pyx

+14-17
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,25 @@ include "hashtable_class_helper.pxi"
5656
include "hashtable_func_helper.pxi"
5757

5858
cdef class Factorizer:
59-
cdef public:
60-
PyObjectHashTable table
61-
ObjectVector uniques
59+
cdef readonly:
6260
Py_ssize_t count
6361

64-
def __init__(self, size_hint: int):
65-
self.table = PyObjectHashTable(size_hint)
66-
self.uniques = ObjectVector()
62+
def __cinit__(self, size_hint: int):
6763
self.count = 0
6864

6965
def get_count(self) -> int:
7066
return self.count
7167

68+
69+
cdef class ObjectFactorizer(Factorizer):
70+
cdef public:
71+
PyObjectHashTable table
72+
ObjectVector uniques
73+
74+
def __cinit__(self, size_hint: int):
75+
self.table = PyObjectHashTable(size_hint)
76+
self.uniques = ObjectVector()
77+
7278
def factorize(
7379
self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
7480
) -> np.ndarray:
@@ -105,24 +111,15 @@ cdef class Factorizer:
105111
self.count = len(self.uniques)
106112
return labels
107113

108-
def unique(self, ndarray[object] values):
109-
# just for fun
110-
return self.table.unique(values)
111114

112-
113-
cdef class Int64Factorizer:
115+
cdef class Int64Factorizer(Factorizer):
114116
cdef public:
115117
Int64HashTable table
116118
Int64Vector uniques
117-
Py_ssize_t count
118119

119-
def __init__(self, size_hint: int):
120+
def __cinit__(self, size_hint: int):
120121
self.table = Int64HashTable(size_hint)
121122
self.uniques = Int64Vector()
122-
self.count = 0
123-
124-
def get_count(self) -> int:
125-
return self.count
126123

127124
def factorize(self, const int64_t[:] values, sort=False,
128125
na_sentinel=-1, na_value=None) -> np.ndarray:

pandas/_libs/hashtable_class_helper.pxi.in

+15-9
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
127127

128128

129129
{{if dtype != 'int64'}}
130+
# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
131+
# by IntervalTree
130132

131133
ctypedef struct {{name}}VectorData:
132134
{{c_type}} *data
@@ -167,6 +169,14 @@ cdef inline bint needs_resize(vector_data *data) nogil:
167169
# Vector
168170
# ----------------------------------------------------------------------
169171

172+
cdef class Vector:
173+
# cdef readonly:
174+
# bint external_view_exists
175+
176+
def __cinit__(self):
177+
self.external_view_exists = False
178+
179+
170180
{{py:
171181

172182
# name, dtype, c_type
@@ -187,11 +197,12 @@ dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
187197

188198
{{for name, dtype, c_type in dtypes}}
189199

190-
cdef class {{name}}Vector:
200+
cdef class {{name}}Vector(Vector):
191201

202+
# For int64 we have to put this declaration in the .pxd file;
203+
# Int64Vector is the only one we need exposed to other Cython files.
192204
{{if dtype != 'int64'}}
193205
cdef:
194-
bint external_view_exists
195206
{{name}}VectorData *data
196207
ndarray ao
197208
{{endif}}
@@ -201,7 +212,6 @@ cdef class {{name}}Vector:
201212
sizeof({{name}}VectorData))
202213
if not self.data:
203214
raise MemoryError()
204-
self.external_view_exists = False
205215
self.data.n = 0
206216
self.data.m = _INIT_VEC_CAP
207217
self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
@@ -246,17 +256,15 @@ cdef class {{name}}Vector:
246256

247257
{{endfor}}
248258

249-
cdef class StringVector:
259+
cdef class StringVector(Vector):
250260

251261
cdef:
252262
StringVectorData *data
253-
bint external_view_exists
254263

255264
def __cinit__(self):
256265
self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
257266
if not self.data:
258267
raise MemoryError()
259-
self.external_view_exists = False
260268
self.data.n = 0
261269
self.data.m = _INIT_VEC_CAP
262270
self.data.data = <char **>malloc(self.data.m * sizeof(char *))
@@ -314,16 +322,14 @@ cdef class StringVector:
314322
self.append(x[i])
315323

316324

317-
cdef class ObjectVector:
325+
cdef class ObjectVector(Vector):
318326

319327
cdef:
320328
PyObject **data
321329
Py_ssize_t n, m
322330
ndarray ao
323-
bint external_view_exists
324331

325332
def __cinit__(self):
326-
self.external_view_exists = False
327333
self.n = 0
328334
self.m = _INIT_VEC_CAP
329335
self.ao = np.empty(_INIT_VEC_CAP, dtype=object)

0 commit comments

Comments
 (0)