Skip to content

Commit e221dd8

Browse files
committed
COMPAT: add refcheck kwarg and percolate out to app-level for PyPy
1 parent f8b25c2 commit e221dd8

File tree

5 files changed

+78
-68
lines changed

5 files changed

+78
-68
lines changed

pandas/_libs/hashtable.pxd

+4 -4
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ cdef class Int64Vector:
5353
cdef Int64VectorData *data
5454
cdef ndarray ao
5555

56-
cdef resize(self)
57-
cpdef to_array(self)
58-
cdef inline void append(self, int64_t x)
59-
cdef extend(self, int64_t[:] x)
56+
cdef resize(self, refcheck=*)
57+
cpdef to_array(self, refcheck=*)
58+
cdef inline void append(self, int64_t x, refcheck=*)
59+
cdef extend(self, int64_t[:] x, refcheck=*)

pandas/_libs/hashtable.pyx

+4 -4
Original file line number | Diff line number | Diff line change
@@ -101,14 +101,14 @@ cdef class Int64Factorizer:
101101
na_sentinel=-1, check_null=True):
102102
labels = self.table.get_labels(values, self.uniques,
103103
self.count, na_sentinel,
104-
check_null)
104+
check_null, refcheck=False)
105105

106106
# sort on
107107
if sort:
108108
if labels.dtype != np.intp:
109109
labels = labels.astype(np.intp)
110110

111-
sorter = self.uniques.to_array().argsort()
111+
sorter = self.uniques.to_array(refcheck=False).argsort()
112112
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
113113
reverse_indexer.put(sorter, np.arange(len(sorter)))
114114

@@ -142,12 +142,12 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels):
142142
if ret != 0:
143143
if needs_resize(ud):
144144
with gil:
145-
idx.resize()
145+
idx.resize(refcheck=False)
146146
append_data_int64(ud, i)
147147

148148
kh_destroy_int64(table)
149149

150-
arr = idx.to_array()
150+
arr = idx.to_array(refcheck=False)
151151
arr = arr[labels[arr].argsort()]
152152

153153
return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr

pandas/_libs/hashtable_class_helper.pxi.in

+41 -34
Original file line number | Diff line number | Diff line change
@@ -85,10 +85,11 @@ cdef class {{name}}Vector:
8585
self.ao = np.empty(self.data.m, dtype={{idtype}})
8686
self.data.data = <{{arg}}*> self.ao.data
8787

88-
cdef resize(self):
89-
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
90-
self.ao.resize(self.data.m)
88+
cdef resize(self, refcheck=True):
89+
m = max(self.data.m * 4, _INIT_VEC_CAP)
90+
self.ao.resize(m, refcheck=refcheck) # could raise, change m later
9191
self.data.data = <{{arg}}*> self.ao.data
92+
self.data.m = m
9293

9394
def __dealloc__(self):
9495
if self.data is not NULL:
@@ -98,21 +99,21 @@ cdef class {{name}}Vector:
9899
def __len__(self):
99100
return self.data.n
100101

101-
cpdef to_array(self):
102-
self.ao.resize(self.data.n)
102+
cpdef to_array(self, refcheck=True):
103+
self.ao.resize(self.data.n, refcheck=refcheck)
103104
self.data.m = self.data.n
105+
self.data.data = <{{arg}}*> self.ao.data
104106
return self.ao
105107

106-
cdef inline void append(self, {{arg}} x):
108+
cdef inline void append(self, {{arg}} x, refcheck=True):
107109

108110
if needs_resize(self.data):
109-
self.resize()
110-
111+
self.resize(refcheck=refcheck)
111112
append_data_{{dtype}}(self.data, x)
112113

113-
cdef extend(self, {{arg}}[:] x):
114+
cdef extend(self, {{arg}}[:] x, refcheck=True):
114115
for i in range(len(x)):
115-
self.append(x[i])
116+
self.append(x[i], refcheck=refcheck)
116117

117118
{{endfor}}
118119

@@ -130,11 +131,12 @@ cdef class StringVector:
130131
self.data.m = _INIT_VEC_CAP
131132
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
132133

133-
cdef resize(self):
134+
cdef resize(self, refcheck=True):
134135
cdef:
135136
char **orig_data
136137
size_t i, m
137138

139+
# refcheck ignored, for compatibility only
138140
m = self.data.m
139141
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
140142

@@ -154,23 +156,24 @@ cdef class StringVector:
154156
def __len__(self):
155157
return self.data.n
156158

157-
def to_array(self):
159+
def to_array(self, refcheck=True):
158160
cdef:
159161
ndarray ao
160162
size_t n
161163
object val
162164

165+
# refcheck ignored, for compatibility only
163166
ao = np.empty(self.data.n, dtype=np.object)
164167
for i in range(self.data.n):
165168
val = self.data.data[i]
166169
ao[i] = val
167170
self.data.m = self.data.n
168171
return ao
169172

170-
cdef inline void append(self, char * x):
173+
cdef inline void append(self, char * x, refcheck=True):
171174

172175
if needs_resize(self.data):
173-
self.resize()
176+
self.resize(refcheck=refcheck)
174177

175178
append_data_string(self.data, x)
176179

@@ -191,18 +194,18 @@ cdef class ObjectVector:
191194
def __len__(self):
192195
return self.n
193196

194-
cdef inline append(self, object o):
197+
cdef inline append(self, object o, refcheck=True):
195198
if self.n == self.m:
196199
self.m = max(self.m * 2, _INIT_VEC_CAP)
197-
self.ao.resize(self.m)
200+
self.ao.resize(self.m, refcheck=refcheck)
198201
self.data = <PyObject**> self.ao.data
199202

200203
Py_INCREF(o)
201204
self.data[self.n] = <PyObject*> o
202205
self.n += 1
203206

204-
def to_array(self):
205-
self.ao.resize(self.n)
207+
def to_array(self, refcheck=True):
208+
self.ao.resize(self.n, refcheck=refcheck)
206209
self.m = self.n
207210
return self.ao
208211

@@ -324,13 +327,13 @@ cdef class {{name}}HashTable(HashTable):
324327

325328
def factorize(self, {{dtype}}_t values):
326329
uniques = {{name}}Vector()
327-
labels = self.get_labels(values, uniques, 0, 0)
328-
return uniques.to_array(), labels
330+
labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
331+
return uniques.to_array(refcheck=False), labels
329332

330333
@cython.boundscheck(False)
331334
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
332335
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
333-
bint check_null=True):
336+
bint check_null=True, bint refcheck=True):
334337
cdef:
335338
Py_ssize_t i, n = len(values)
336339
int64_t[:] labels
@@ -362,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
362365

363366
if needs_resize(ud):
364367
with gil:
365-
uniques.resize()
368+
uniques.resize(refcheck=refcheck)
366369
append_data_{{dtype}}(ud, val)
367370
labels[i] = count
368371
count += 1
@@ -405,12 +408,12 @@ cdef class {{name}}HashTable(HashTable):
405408

406409
if needs_resize(ud):
407410
with gil:
408-
uniques.resize()
411+
uniques.resize(refcheck=False)
409412
append_data_{{dtype}}(ud, val)
410413
labels[i] = count
411414
count += 1
412415

413-
arr_uniques = uniques.to_array()
416+
arr_uniques = uniques.to_array(refcheck=False)
414417

415418
return np.asarray(labels), arr_uniques
416419

@@ -438,25 +441,25 @@ cdef class {{name}}HashTable(HashTable):
438441
kh_put_{{dtype}}(self.table, val, &ret)
439442
if needs_resize(ud):
440443
with gil:
441-
uniques.resize()
444+
uniques.resize(refcheck=False)
442445
append_data_{{dtype}}(ud, val)
443446
elif not seen_na:
444447
seen_na = 1
445448
if needs_resize(ud):
446449
with gil:
447-
uniques.resize()
450+
uniques.resize(refcheck=False)
448451
append_data_{{dtype}}(ud, NAN)
449452
{{else}}
450453
k = kh_get_{{dtype}}(self.table, val)
451454
if k == self.table.n_buckets:
452455
kh_put_{{dtype}}(self.table, val, &ret)
453456
if needs_resize(ud):
454457
with gil:
455-
uniques.resize()
458+
uniques.resize(refcheck=False)
456459
append_data_{{dtype}}(ud, val)
457460
{{endif}}
458461

459-
return uniques.to_array()
462+
return uniques.to_array(refcheck=False)
460463

461464
{{endfor}}
462465

@@ -571,12 +574,12 @@ cdef class StringHashTable(HashTable):
571574
uniques = ObjectVector()
572575
for i in range(count):
573576
uniques.append(values[uindexer[i]])
574-
return uniques.to_array()
577+
return uniques.to_array(refcheck=False)
575578

576579
def factorize(self, ndarray[object] values):
577580
uniques = ObjectVector()
578-
labels = self.get_labels(values, uniques, 0, 0)
579-
return uniques.to_array(), labels
581+
labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
582+
return uniques.to_array(refcheck=False), labels
580583

581584
@cython.boundscheck(False)
582585
def lookup(self, ndarray[object] values):
@@ -642,7 +645,7 @@ cdef class StringHashTable(HashTable):
642645
@cython.boundscheck(False)
643646
def get_labels(self, ndarray[object] values, ObjectVector uniques,
644647
Py_ssize_t count_prior, int64_t na_sentinel,
645-
bint check_null=1):
648+
bint check_null=1, bint refcheck=1):
646649
cdef:
647650
Py_ssize_t i, n = len(values)
648651
int64_t[:] labels
@@ -654,6 +657,8 @@ cdef class StringHashTable(HashTable):
654657
char **vecs
655658
khiter_t k
656659

660+
# refcheck ignored, for compatibility only
661+
657662
# these by-definition *must* be strings
658663
labels = np.zeros(n, dtype=np.int64)
659664
uindexer = np.empty(n, dtype=np.int64)
@@ -811,11 +816,11 @@ cdef class PyObjectHashTable(HashTable):
811816
seen_na = 1
812817
uniques.append(nan)
813818

814-
return uniques.to_array()
819+
return uniques.to_array(refcheck=False)
815820

816821
def get_labels(self, ndarray[object] values, ObjectVector uniques,
817822
Py_ssize_t count_prior, int64_t na_sentinel,
818-
bint check_null=True):
823+
bint check_null=True, bint refcheck=True):
819824
cdef:
820825
Py_ssize_t i, n = len(values)
821826
int64_t[:] labels
@@ -824,6 +829,8 @@ cdef class PyObjectHashTable(HashTable):
824829
object val
825830
khiter_t k
826831

832+
# refcheck ignored, for compatibility only
833+
827834
labels = np.empty(n, dtype=np.int64)
828835

829836
for i in range(n):

0 commit comments

Comments
 (0)