Skip to content

Commit bf4b3f5

Browse files
committed
fixed-size arrays for get_index mapping
1 parent 5494a4c commit bf4b3f5

File tree

1 file changed

+70
-117
lines changed

1 file changed

+70
-117
lines changed

pandas/index.pyx

+70-117
Original file line numberDiff line numberDiff line change
@@ -59,133 +59,114 @@ ctypedef struct Int64List:
5959
@cython.boundscheck(False)
6060
@cython.wraparound(False)
6161
@cython.initializedcheck(False)
62-
cdef Int64List* Int64List_create_array(Py_ssize_t n) nogil:
62+
cdef _count(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
63+
int64_t[:] mapping_count, int64_t[:] missing_count):
6364

6465
cdef:
65-
Int64List *lst = <Int64List *> malloc(n * sizeof(Int64List))
66-
Py_ssize_t i
66+
int64_t n_v = values.shape[0]
67+
int64_t n_t = targets.shape[0]
68+
int64_t i = 0
69+
int64_t j = 0
6770

68-
for i in range(n):
69-
lst[i].n = 0
70-
lst[i].root = NULL
71-
lst[i].last = NULL
71+
while i < n_v and j < n_t:
7272

73-
return lst
74-
75-
@cython.boundscheck(False)
76-
@cython.wraparound(False)
77-
@cython.initializedcheck(False)
78-
cdef void Int64List_destroy_array(Int64List *lst, Py_ssize_t n) nogil:
79-
cdef:
80-
Int64ListNode *next
81-
Int64ListNode *p
82-
Py_ssize_t i
83-
84-
for i in range(n):
85-
if lst[i].owns:
86-
p = lst[i].root
87-
while p is not NULL:
88-
next = p[0].next
89-
free(p)
90-
p = next
73+
val0 = values[idx0[i]]
74+
val1 = targets[idx1[j]]
9175

92-
free(lst)
76+
if val0 == val1:
9377

94-
@cython.boundscheck(False)
95-
@cython.wraparound(False)
96-
@cython.initializedcheck(False)
97-
cdef inline void _append(Int64List *lst, int64_t x) nogil:
78+
while i < n_v and values[idx0[i]] == val1:
79+
i += 1
80+
mapping_count[idx1[j]] += 1
9881

99-
cdef Int64ListNode *nn = <Int64ListNode *> malloc(sizeof(Int64ListNode))
82+
j += 1
83+
while j < n_t and val0 == targets[idx1[j]]:
84+
mapping_count[idx1[j]] = mapping_count[idx1[j-1]]
85+
j += 1
10086

101-
nn[0].value = x
102-
nn[0].next = NULL
87+
elif val0 > val1:
10388

104-
if lst[0].root is NULL:
105-
lst[0].root = nn
106-
lst[0].owns = 1
107-
else:
108-
lst[0].last[0].next = nn
89+
mapping_count[idx1[j]] += 1
90+
missing_count[idx1[j]] = 1
91+
j += 1
10992

110-
lst[0].last = nn
111-
lst[0].n += 1
93+
else:
94+
i += 1
11295

113-
@cython.boundscheck(False)
114-
@cython.wraparound(False)
115-
@cython.initializedcheck(False)
116-
cdef inline void _copy_to(Int64List *dst, Int64List *src) nogil:
117-
dst[0].root = src[0].root
118-
dst[0].last = src[0].last
119-
dst[0].n = src[0].n
120-
dst[0].owns = 0
96+
while j < n_t:
97+
mapping_count[idx1[j]] += 1
98+
missing_count[idx1[j]] = 1
99+
j += 1
121100

122101
@cython.boundscheck(False)
123102
@cython.wraparound(False)
124103
@cython.initializedcheck(False)
125-
cdef int64_t* Int64List_concat_array(Int64List* lst, Py_ssize_t n,
126-
Py_ssize_t *nt) nogil:
127-
nt[0] = 0
128-
cdef:
129-
Py_ssize_t last = 0
130-
Int64ListNode* node
104+
cdef _map(ndarray values, ndarray targets, int64_t[:] idx0, int64_t[:] idx1,
105+
int64_t[:] start_mapping, int64_t[:] start_missing,
106+
int64_t[:] mapping, int64_t[:] missing):
131107

132-
for i in range(n):
133-
nt[0] += lst[i].n
134-
135-
cdef int64_t *data = <int64_t *> malloc(nt[0] * sizeof(int64_t))
136-
137-
for i in range(n):
138-
139-
node = lst[i].root
140-
while node is not NULL:
141-
data[last] = node[0].value
142-
last += 1
143-
node = node[0].next
144-
145-
return data
146-
147-
148-
@cython.boundscheck(False)
149-
@cython.wraparound(False)
150-
@cython.initializedcheck(False)
151-
cdef _indexer_non_unique_orderable_loop(ndarray values, ndarray targets,
152-
int64_t[:] idx0,
153-
int64_t[:] idx1,
154-
Int64List* result,
155-
Int64List* missing):
156108
cdef:
157-
Py_ssize_t i = 0, j = 0, n = idx0.shape[0], n_t = idx1.shape[0]
109+
int64_t n_v = values.shape[0]
110+
int64_t n_t = targets.shape[0]
111+
int64_t i = 0
112+
int64_t j = 0
113+
int64_t c
158114

159-
while i < n and j < n_t:
115+
while i < n_v and j < n_t:
160116

161117
val0 = values[idx0[i]]
162118
val1 = targets[idx1[j]]
163119

164120
if val0 == val1:
165121

166-
while i < n and values[idx0[i]] == val1:
167-
_append(&(result[idx1[j]]), idx0[i])
168-
i += 1
122+
c = 0
123+
while i < n_v and values[idx0[i]] == val1:
124+
mapping[start_mapping[idx1[j]] + c] = idx0[i]
125+
i += 1
126+
c += 1
169127

170128
j += 1
171129
while j < n_t and val0 == targets[idx1[j]]:
172-
_copy_to(&(result[idx1[j]]), &(result[idx1[j-1]]))
130+
for ii in range(c):
131+
mapping[start_mapping[idx1[j]] + ii] = \
132+
mapping[start_mapping[idx1[j-1]] + ii]
173133
j += 1
174134

175135
elif val0 > val1:
176136

177-
_append(&(result[idx1[j]]), -1)
178-
_append(&(missing[idx1[j]]), idx1[j])
137+
mapping[start_mapping[idx1[j]]] = -1
138+
missing[start_missing[idx1[j]]] = idx1[j]
179139
j += 1
180140

181141
else:
182142
i += 1
183143

184144
while j < n_t:
185-
_append(&(result[idx1[j]]), -1)
186-
_append(&(missing[idx1[j]]), idx1[j])
145+
146+
mapping[start_mapping[idx1[j]]] = -1
147+
missing[start_missing[idx1[j]]] = idx1[j]
187148
j += 1
188149

150+
def _map_targets_to_values(values, targets, idx0, idx1):
    """
    Build the flat ``mapping`` / ``missing`` arrays for a non-unique lookup.

    The work is done in two passes over the same data so that the output
    arrays can be allocated once at their exact final size:

    1. ``_count`` records, per target position, how many slots it needs in
       ``mapping`` and whether it needs a slot in ``missing``.
    2. ``_map`` fills ``mapping`` and ``missing`` using per-target start
       offsets derived from those counts.

    Parameters
    ----------
    values, targets : ndarray
        Arrays being matched against each other.
    idx0, idx1 : int64 arrays
        Index arrays used by ``_count`` / ``_map`` to walk ``values`` and
        ``targets`` (``values[idx0[i]]``, ``targets[idx1[j]]``).
        Presumably the sort orders of the two arrays -- confirm with caller.

    Returns
    -------
    mapping : ndarray[int64]
        For each target position, in target order, the ``idx0`` entries of
        the equal elements of ``values``, or a single ``-1`` when none match.
    missing : ndarray[int64]
        The target positions (``idx1`` entries) that had no match.
    """
    n_targets = len(targets)

    # Nothing to look up: return empty results instead of failing on the
    # ``[-1]`` lookups below.
    if n_targets == 0:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)

    # Explicit int64: these buffers are handed to ``int64_t[:]`` memoryviews,
    # and the builtin ``int`` dtype is only 32 bits wide on some platforms.
    mapping_count = np.zeros(n_targets, dtype=np.int64)
    missing_count = np.zeros(n_targets, dtype=np.int64)

    _count(values, targets, idx0, idx1, mapping_count, missing_count)

    mapping_end = np.cumsum(mapping_count)
    missing_end = np.cumsum(missing_count)

    mapping = np.empty(mapping_end[-1], dtype=np.int64)
    missing = np.empty(missing_end[-1], dtype=np.int64)

    # Exclusive prefix sum: first slot in ``mapping`` owned by each target.
    # Computed as end - count so no overlapping in-place slice copy is needed.
    start_mapping = mapping_end - mapping_count

    # A missing target owns exactly one slot, the one just before its running
    # total. Entries for non-missing targets are never read by ``_map``.
    start_missing = missing_end - 1

    _map(values, targets, idx0, idx1, start_mapping, start_missing, mapping,
         missing)

    return mapping, missing
189170

190171
cdef inline is_definitely_invalid_key(object val):
191172
if PyTuple_Check(val):
@@ -524,35 +505,7 @@ cdef class IndexEngine:
524505

525506
self._ensure_mapping_populated()
526507
values = self._get_index_values()
527-
n_t = len(targets)
528-
529-
cdef:
530-
Int64List* result = Int64List_create_array(n_t)
531-
Int64List* missing = Int64List_create_array(n_t)
532-
533-
_indexer_non_unique_orderable_loop(values, targets, idx0, idx1,
534-
result, missing)
535-
536-
cdef:
537-
Py_ssize_t nres, nmis
538-
int64_t *cresult
539-
int64_t *cmissing
540-
541-
cresult = Int64List_concat_array(result, n_t, &nres)
542-
cmissing = Int64List_concat_array(missing, n_t, &nmis)
543-
544-
Int64List_destroy_array(result, n_t)
545-
Int64List_destroy_array(missing, n_t)
546-
547-
cdef:
548-
cnp.npy_intp *dims0 = [nres]
549-
cnp.npy_intp *dims1 = [nmis]
550-
ndarray npy_result = PyArray_SimpleNewFromData(1, dims0,
551-
NPY_INT64, cresult)
552-
ndarray npy_missing = PyArray_SimpleNewFromData(1, dims1,
553-
NPY_INT64, cmissing)
554-
555-
return npy_result, npy_missing
508+
return _map_targets_to_values(values, targets, idx0, idx1)
556509

557510
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
558511
cdef:

0 commit comments

Comments
 (0)