Skip to content

Commit 56233e4

Browse files
committed
Add 'refcheck=True' as a kwarg to functions that call ndarray.resize; pass refcheck=False where it is safe to do so
1 parent eaf50fe commit 56233e4

File tree

5 files changed

+41
-39
lines changed

5 files changed

+41
-39
lines changed

pandas/_libs/hashtable.pxd

+3-3
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ cdef class Int64Vector:
5353
cdef Int64VectorData *data
5454
cdef ndarray ao
5555

56-
cdef resize(self)
57-
cpdef to_array(self)
58-
cdef inline void append(self, int64_t x)
56+
cdef resize(self, refcheck=*)
57+
cpdef to_array(self, refcheck=*)
58+
cdef inline void append(self, int64_t x, refcheck=*)
5959
cdef extend(self, int64_t[:] x)

pandas/_libs/hashtable_class_helper.pxi.in

+35-32
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ cdef class {{name}}Vector:
8585
self.ao = np.empty(self.data.m, dtype={{idtype}})
8686
self.data.data = <{{arg}}*> self.ao.data
8787

88-
cdef resize(self):
88+
cdef resize(self, refcheck=True):
8989
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
90-
self.ao.resize(self.data.m)
90+
self.ao.resize(self.data.m, refcheck=refcheck)
9191
self.data.data = <{{arg}}*> self.ao.data
9292

9393
def __dealloc__(self):
@@ -98,15 +98,15 @@ cdef class {{name}}Vector:
9898
def __len__(self):
9999
return self.data.n
100100

101-
cpdef to_array(self):
102-
self.ao.resize(self.data.n)
101+
cpdef to_array(self, refcheck=True):
102+
self.ao.resize(self.data.n, refcheck=refcheck)
103103
self.data.m = self.data.n
104104
return self.ao
105105

106-
cdef inline void append(self, {{arg}} x):
106+
cdef inline void append(self, {{arg}} x, refcheck=True):
107107

108108
if needs_resize(self.data):
109-
self.resize()
109+
self.resize(refcheck=refcheck)
110110

111111
append_data_{{dtype}}(self.data, x)
112112

@@ -130,11 +130,12 @@ cdef class StringVector:
130130
self.data.m = _INIT_VEC_CAP
131131
self.data.data = <char **> malloc(self.data.m * sizeof(char *))
132132

133-
cdef resize(self):
133+
cdef resize(self, refcheck=True):
134134
cdef:
135135
char **orig_data
136136
size_t i, m
137137

138+
# refcheck is ignored; it is accepted only for API compatibility
138139
m = self.data.m
139140
self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
140141

@@ -154,23 +155,24 @@ cdef class StringVector:
154155
def __len__(self):
155156
return self.data.n
156157

157-
def to_array(self):
158+
def to_array(self, refcheck=True):
158159
cdef:
159160
ndarray ao
160161
size_t n
161162
object val
162-
163+
164+
# refcheck is unused but needed for API compatibility
163165
ao = np.empty(self.data.n, dtype=np.object)
164166
for i in range(self.data.n):
165167
val = self.data.data[i]
166168
ao[i] = val
167169
self.data.m = self.data.n
168170
return ao
169171

170-
cdef inline void append(self, char * x):
172+
cdef inline void append(self, char * x, refcheck=True):
171173

172174
if needs_resize(self.data):
173-
self.resize()
175+
self.resize(refcheck=refcheck)
174176

175177
append_data_string(self.data, x)
176178

@@ -191,18 +193,18 @@ cdef class ObjectVector:
191193
def __len__(self):
192194
return self.n
193195

194-
cdef inline append(self, object o):
196+
cdef inline append(self, object o, refcheck=True):
195197
if self.n == self.m:
196198
self.m = max(self.m * 2, _INIT_VEC_CAP)
197-
self.ao.resize(self.m)
199+
self.ao.resize(self.m, refcheck=refcheck)
198200
self.data = <PyObject**> self.ao.data
199201

200202
Py_INCREF(o)
201203
self.data[self.n] = <PyObject*> o
202204
self.n += 1
203205

204-
def to_array(self):
205-
self.ao.resize(self.n)
206+
def to_array(self, refcheck=True):
207+
self.ao.resize(self.n, refcheck=refcheck)
206208
self.m = self.n
207209
return self.ao
208210

@@ -324,13 +326,13 @@ cdef class {{name}}HashTable(HashTable):
324326

325327
def factorize(self, {{dtype}}_t values):
326328
uniques = {{name}}Vector()
327-
labels = self.get_labels(values, uniques, 0, 0)
328-
return uniques.to_array(), labels
329+
labels = self.get_labels(values, uniques, 0, 0, refcheck=False)
330+
return uniques.to_array(refcheck=False), labels
329331

330332
@cython.boundscheck(False)
331333
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
332334
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
333-
bint check_null=True):
335+
bint check_null=True, bint refcheck=True):
334336
cdef:
335337
Py_ssize_t i, n = len(values)
336338
int64_t[:] labels
@@ -362,7 +364,7 @@ cdef class {{name}}HashTable(HashTable):
362364

363365
if needs_resize(ud):
364366
with gil:
365-
uniques.resize()
367+
uniques.resize(refcheck=refcheck)
366368
append_data_{{dtype}}(ud, val)
367369
labels[i] = count
368370
count += 1
@@ -405,12 +407,12 @@ cdef class {{name}}HashTable(HashTable):
405407

406408
if needs_resize(ud):
407409
with gil:
408-
uniques.resize()
410+
uniques.resize(refcheck=False)
409411
append_data_{{dtype}}(ud, val)
410412
labels[i] = count
411413
count += 1
412414

413-
arr_uniques = uniques.to_array()
415+
arr_uniques = uniques.to_array(refcheck=False)
414416

415417
return np.asarray(labels), arr_uniques
416418

@@ -438,25 +440,25 @@ cdef class {{name}}HashTable(HashTable):
438440
kh_put_{{dtype}}(self.table, val, &ret)
439441
if needs_resize(ud):
440442
with gil:
441-
uniques.resize()
443+
uniques.resize(refcheck=False)
442444
append_data_{{dtype}}(ud, val)
443445
elif not seen_na:
444446
seen_na = 1
445447
if needs_resize(ud):
446448
with gil:
447-
uniques.resize()
449+
uniques.resize(refcheck=False)
448450
append_data_{{dtype}}(ud, NAN)
449451
{{else}}
450452
k = kh_get_{{dtype}}(self.table, val)
451453
if k == self.table.n_buckets:
452454
kh_put_{{dtype}}(self.table, val, &ret)
453455
if needs_resize(ud):
454456
with gil:
455-
uniques.resize()
457+
uniques.resize(refcheck=False)
456458
append_data_{{dtype}}(ud, val)
457459
{{endif}}
458460

459-
return uniques.to_array()
461+
return uniques.to_array(refcheck=False)
460462

461463
{{endfor}}
462464

@@ -571,12 +573,12 @@ cdef class StringHashTable(HashTable):
571573
uniques = ObjectVector()
572574
for i in range(count):
573575
uniques.append(values[uindexer[i]])
574-
return uniques.to_array()
576+
return uniques.to_array(refcheck=False)
575577

576578
def factorize(self, ndarray[object] values):
577579
uniques = ObjectVector()
578-
labels = self.get_labels(values, uniques, 0, 0)
579-
return uniques.to_array(), labels
580+
labels = self.get_labels(values, uniques, 0, 0, refcheck=0)
581+
return uniques.to_array(refcheck=False), labels
580582

581583
@cython.boundscheck(False)
582584
def lookup(self, ndarray[object] values):
@@ -642,7 +644,7 @@ cdef class StringHashTable(HashTable):
642644
@cython.boundscheck(False)
643645
def get_labels(self, ndarray[object] values, ObjectVector uniques,
644646
Py_ssize_t count_prior, int64_t na_sentinel,
645-
bint check_null=1):
647+
bint check_null=1, bint refcheck=1):
646648
cdef:
647649
Py_ssize_t i, n = len(values)
648650
int64_t[:] labels
@@ -654,6 +656,7 @@ cdef class StringHashTable(HashTable):
654656
char **vecs
655657
khiter_t k
656658

659+
# refcheck is accepted for API compatibility
657660
# these by-definition *must* be strings
658661
labels = np.zeros(n, dtype=np.int64)
659662
uindexer = np.empty(n, dtype=np.int64)
@@ -811,11 +814,11 @@ cdef class PyObjectHashTable(HashTable):
811814
seen_na = 1
812815
uniques.append(nan)
813816

814-
return uniques.to_array()
817+
return uniques.to_array(refcheck=False)
815818

816819
def get_labels(self, ndarray[object] values, ObjectVector uniques,
817820
Py_ssize_t count_prior, int64_t na_sentinel,
818-
bint check_null=True):
821+
bint check_null=True, bint refcheck=True):
819822
cdef:
820823
Py_ssize_t i, n = len(values)
821824
int64_t[:] labels
@@ -968,5 +971,5 @@ cdef class MultiIndexHashTable(HashTable):
968971

969972
def get_labels(self, object mi, ObjectVector uniques,
970973
Py_ssize_t count_prior, int64_t na_sentinel,
971-
bint check_null=True):
974+
bint check_null=True, bint refcheck=True):
972975
raise NotImplementedError

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -349,11 +349,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
349349
table = hash_klass(size_hint or len(vals))
350350
uniques = vec_klass()
351351
check_nulls = not is_integer_dtype(values)
352-
labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls)
352+
labels = table.get_labels(vals, uniques, 0, na_sentinel, check_nulls, refcheck=False)
353353

354354
labels = _ensure_platform_int(labels)
355355

356-
uniques = uniques.to_array()
356+
uniques = uniques.to_array(refcheck=False)
357357

358358
if sort and len(uniques) > 0:
359359
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,

pandas/core/categorical.py

-1
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,6 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
289289
"explicitly specify the categories order "
290290
"by passing in a categories argument.")
291291
except ValueError:
292-
293292
# FIXME
294293
raise NotImplementedError("> 1 ndim Categorical are not "
295294
"supported at this time")

pandas/tools/merge.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1410,7 +1410,7 @@ def _factorize_keys(lk, rk, sort=True):
14101410
count = rizer.get_count()
14111411

14121412
if sort:
1413-
uniques = rizer.uniques.to_array()
1413+
uniques = rizer.uniques.to_array(refcheck=False)
14141414
llab, rlab = _sort_labels(uniques, llab, rlab)
14151415

14161416
# NA group

0 commit comments

Comments
 (0)