Skip to content

Commit f6d3cb2

Browse files
authored
ENH: Compile Factorizer class for all numeric dtypes (#49624)
* ENH: Compile Factorizer class for all numeric dtypes
* Fix test
* Fix test
* Add factorize to base class
* Remove ignores
* Move factorizer
1 parent c04bba4 commit f6d3cb2

File tree

4 files changed

+97
-60
lines changed

4 files changed

+97
-60
lines changed

pandas/_libs/hashtable.pyi

+49 -13
Original file line number | Diff line number | Diff line change
@@ -15,29 +15,65 @@ class Factorizer:
1515
count: int
1616
def __init__(self, size_hint: int) -> None: ...
1717
def get_count(self) -> int: ...
18-
19-
class ObjectFactorizer(Factorizer):
20-
table: PyObjectHashTable
21-
uniques: ObjectVector
2218
def factorize(
2319
self,
24-
values: npt.NDArray[np.object_],
20+
values: np.ndarray,
2521
sort: bool = ...,
2622
na_sentinel=...,
2723
na_value=...,
2824
) -> npt.NDArray[np.intp]: ...
2925

26+
class ObjectFactorizer(Factorizer):
27+
table: PyObjectHashTable
28+
uniques: ObjectVector
29+
3030
class Int64Factorizer(Factorizer):
3131
table: Int64HashTable
3232
uniques: Int64Vector
33-
def factorize(
34-
self,
35-
values: np.ndarray, # const int64_t[:]
36-
sort: bool = ...,
37-
na_sentinel=...,
38-
na_value=...,
39-
mask=...,
40-
) -> npt.NDArray[np.intp]: ...
33+
34+
class UInt64Factorizer(Factorizer):
35+
table: UInt64HashTable
36+
uniques: UInt64Vector
37+
38+
class Int32Factorizer(Factorizer):
39+
table: Int32HashTable
40+
uniques: Int32Vector
41+
42+
class UInt32Factorizer(Factorizer):
43+
table: UInt32HashTable
44+
uniques: UInt32Vector
45+
46+
class Int16Factorizer(Factorizer):
47+
table: Int16HashTable
48+
uniques: Int16Vector
49+
50+
class UInt16Factorizer(Factorizer):
51+
table: UInt16HashTable
52+
uniques: UInt16Vector
53+
54+
class Int8Factorizer(Factorizer):
55+
table: Int8HashTable
56+
uniques: Int8Vector
57+
58+
class UInt8Factorizer(Factorizer):
59+
table: UInt8HashTable
60+
uniques: UInt8Vector
61+
62+
class Float64Factorizer(Factorizer):
63+
table: Float64HashTable
64+
uniques: Float64Vector
65+
66+
class Float32Factorizer(Factorizer):
67+
table: Float32HashTable
68+
uniques: Float32Vector
69+
70+
class Complex64Factorizer(Factorizer):
71+
table: Complex64HashTable
72+
uniques: Complex64Vector
73+
74+
class Complex128Factorizer(Factorizer):
75+
table: Complex128HashTable
76+
uniques: Complex128Vector
4177

4278
class Int64Vector:
4379
def __init__(self, *args) -> None: ...

pandas/_libs/hashtable.pyx

+7 -39
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,9 @@ cdef class Factorizer:
7979
def get_count(self) -> int:
8080
return self.count
8181

82+
def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray:
83+
raise NotImplementedError
84+
8285

8386
cdef class ObjectFactorizer(Factorizer):
8487
cdef public:
@@ -90,7 +93,7 @@ cdef class ObjectFactorizer(Factorizer):
9093
self.uniques = ObjectVector()
9194

9295
def factorize(
93-
self, ndarray[object] values, na_sentinel=-1, na_value=None
96+
self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None
9497
) -> np.ndarray:
9598
"""
9699

@@ -109,6 +112,9 @@ cdef class ObjectFactorizer(Factorizer):
109112
cdef:
110113
ndarray[intp_t] labels
111114

115+
if mask is not None:
116+
raise NotImplementedError("mask not supported for ObjectFactorizer.")
117+
112118
if self.uniques.external_view_exists:
113119
uniques = ObjectVector()
114120
uniques.extend(self.uniques.to_array())
@@ -117,41 +123,3 @@ cdef class ObjectFactorizer(Factorizer):
117123
self.count, na_sentinel, na_value)
118124
self.count = len(self.uniques)
119125
return labels
120-
121-
122-
cdef class Int64Factorizer(Factorizer):
123-
cdef public:
124-
Int64HashTable table
125-
Int64Vector uniques
126-
127-
def __cinit__(self, size_hint: int):
128-
self.table = Int64HashTable(size_hint)
129-
self.uniques = Int64Vector()
130-
131-
def factorize(self, const int64_t[:] values,
132-
na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
133-
"""
134-
Returns
135-
-------
136-
ndarray[intp_t]
137-
138-
Examples
139-
--------
140-
Factorize values with nans replaced by na_sentinel
141-
142-
>>> fac = Int64Factorizer(3)
143-
>>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
144-
array([0, 1, 2])
145-
"""
146-
cdef:
147-
ndarray[intp_t] labels
148-
149-
if self.uniques.external_view_exists:
150-
uniques = Int64Vector()
151-
uniques.extend(self.uniques.to_array())
152-
self.uniques = uniques
153-
labels = self.table.get_labels(values, self.uniques,
154-
self.count, na_sentinel,
155-
na_value=na_value, mask=mask)
156-
self.count = len(self.uniques)
157-
return labels

pandas/_libs/hashtable_class_helper.pxi.in

+39
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,7 @@ from pandas._libs.khash cimport (
101101
from pandas._libs.tslibs.util cimport get_c_string
102102
from pandas._libs.missing cimport C_NA
103103

104+
104105
{{py:
105106

106107
# name, dtype, c_type
@@ -876,6 +877,44 @@ cdef class {{name}}HashTable(HashTable):
876877
return np.asarray(labels), arr_uniques
877878
{{endif}}
878879

880+
881+
cdef class {{name}}Factorizer(Factorizer):
882+
cdef public:
883+
{{name}}HashTable table
884+
{{name}}Vector uniques
885+
886+
def __cinit__(self, size_hint: int):
887+
self.table = {{name}}HashTable(size_hint)
888+
self.uniques = {{name}}Vector()
889+
890+
def factorize(self, const {{c_type}}[:] values,
891+
na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
892+
"""
893+
Returns
894+
-------
895+
ndarray[intp_t]
896+
897+
Examples
898+
--------
899+
Factorize values with nans replaced by na_sentinel
900+
901+
>>> fac = {{name}}Factorizer(3)
902+
>>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
903+
array([0, 1, 2])
904+
"""
905+
cdef:
906+
ndarray[intp_t] labels
907+
908+
if self.uniques.external_view_exists:
909+
uniques = {{name}}Vector()
910+
uniques.extend(self.uniques.to_array())
911+
self.uniques = uniques
912+
labels = self.table.get_labels(values, self.uniques,
913+
self.count, na_sentinel,
914+
na_value=na_value, mask=mask)
915+
self.count = len(self.uniques)
916+
return labels
917+
879918
{{endfor}}
880919

881920

pandas/core/reshape/merge.py

+2 -8
Original file line number | Diff line number | Diff line change
@@ -2362,14 +2362,8 @@ def _factorize_keys(
23622362

23632363
rizer = klass(max(len(lk), len(rk)))
23642364

2365-
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
2366-
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
2367-
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2368-
llab = rizer.factorize(lk) # type: ignore[arg-type]
2369-
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
2370-
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
2371-
# ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2372-
rlab = rizer.factorize(rk) # type: ignore[arg-type]
2365+
llab = rizer.factorize(lk)
2366+
rlab = rizer.factorize(rk)
23732367
assert llab.dtype == np.dtype(np.intp), llab.dtype
23742368
assert rlab.dtype == np.dtype(np.intp), rlab.dtype
23752369

0 commit comments

Comments
 (0)