@@ -46,12 +46,10 @@ include "hashtable_func_helper.pxi"
46
46
47
47
cdef class Factorizer:
48
48
cdef public PyObjectHashTable table
49
- cdef public ObjectVector uniques
50
49
cdef public Py_ssize_t count
51
50
52
51
def __init__ (self , size_hint ):
53
52
self .table = PyObjectHashTable(size_hint)
54
- self .uniques = ObjectVector()
55
53
self .count = 0
56
54
57
55
def get_count (self ):
@@ -64,19 +62,22 @@ cdef class Factorizer:
64
62
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
65
63
array([ 0, 1, 20])
66
64
"""
67
- labels = self .table.get_labels(values, self .uniques,
65
+ uniques = ObjectVector()
66
+ labels = self .table.get_labels(values, uniques,
68
67
self .count, na_sentinel, check_null)
69
68
mask = (labels == na_sentinel)
69
+ if len (labels) == 0 :
70
+ return labels
70
71
# sort on
71
72
if sort:
72
73
if labels.dtype != np.intp:
73
74
labels = labels.astype(np.intp)
74
- sorter = self . uniques.to_array().argsort()
75
+ sorter = uniques.to_array().argsort()
75
76
reverse_indexer = np.empty(len (sorter), dtype = np.intp)
76
77
reverse_indexer.put(sorter, np.arange(len (sorter)))
77
78
labels = reverse_indexer.take(labels, mode = ' clip' )
78
79
labels[mask] = na_sentinel
79
- self .count = len (self . uniques)
80
+ self .count = len (uniques)
80
81
return labels
81
82
82
83
def unique (self , ndarray[object] values ):
@@ -86,35 +87,36 @@ cdef class Factorizer:
86
87
87
88
cdef class Int64Factorizer:
    """
    Assign integer labels to int64 values via an ``Int64HashTable``.

    The hash table lives on the instance, while the vector that collects
    unique values is created anew on every ``factorize`` call.
    """
    cdef public Int64HashTable table
    cdef public Py_ssize_t count

    def __init__(self, size_hint):
        # size_hint is forwarded unchanged to the hash table constructor.
        self.table = Int64HashTable(size_hint)
        self.count = 0

    def get_count(self):
        """Return the current value of ``count``."""
        return self.count

    def factorize(self, int64_t[:] values, sort=False,
                  na_sentinel=-1, check_null=True):
        """
        Return the labels produced by ``self.table.get_labels`` for ``values``.

        Parameters
        ----------
        values : int64_t[:]
            Values to label.
        sort : bool, default False
            If True, remap the labels through the argsort of the uniques
            collected during this call.
        na_sentinel : int, default -1
            Forwarded to ``get_labels``.
        check_null : bool, default True
            Forwarded to ``get_labels``.

        Returns
        -------
        labels
            The array returned by ``get_labels``; cast to ``np.intp`` and
            remapped when ``sort`` is True. Returned as-is when empty.
        """
        # The vector must be the same object that get_labels fills;
        # there is no ``uniques`` attribute on the instance any more, so
        # the local one is passed here and reused below.
        uniques = Int64Vector()
        labels = self.table.get_labels(values, uniques,
                                       self.count, na_sentinel,
                                       check_null)

        # Empty result: nothing to remap, and ``count`` is left untouched.
        if len(labels) == 0:
            return labels

        # sort on
        if sort:
            if labels.dtype != np.intp:
                labels = labels.astype(np.intp)

            # reverse_indexer[i] is the rank of unique i in sorted order.
            sorter = uniques.to_array().argsort()
            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
            reverse_indexer.put(sorter, np.arange(len(sorter)))

            labels = reverse_indexer.take(labels)

        # NOTE(review): ``uniques`` only holds what this call collected,
        # while ``self.table`` persists between calls -- confirm that
        # overwriting ``count`` (rather than accumulating) is intended
        # when ``factorize`` is called more than once.
        self.count = len(uniques)
        return labels
119
121
120
122
0 commit comments