@@ -126,150 +126,56 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
126
126
127
127
128
128
def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right,
                         Py_ssize_t max_groups,  # ignored
                         bint allow_exact_matches=1,
                         left_values=None,
                         right_values=None,
                         tolerance=None):
    """
    Compute indexers for a backward-looking "as of" left join.

    For every position in ``left`` the matching position in ``right`` is
    the last one whose key is ``<=`` the left key (or strictly ``<`` when
    ``allow_exact_matches`` is false).

    NOTE(review): ``right_pos`` only ever moves forward across iterations,
    so this presumably expects ``left`` and ``right`` to be sorted in
    ascending order -- confirm against the callers.

    Parameters
    ----------
    left, right : ndarray[int64_t]
        Join keys.
    max_groups : Py_ssize_t
        Unused by this function; it is accepted and ignored.
    allow_exact_matches : bint, default 1
        When true a right key equal to the left key is an acceptable
        match; when false only strictly smaller right keys match.
    left_values, right_values : ndarray[int64_t] or None
        Values compared against ``tolerance``. Only consulted when both
        are given together with ``tolerance``.
    tolerance : int64-compatible or None
        Largest allowed ``left_values[i] - right_values[j]`` for a match
        to be kept. The check is skipped unless ``left_values``,
        ``right_values`` and ``tolerance`` are all not None.

    Returns
    -------
    (left_indexer, right_indexer) : tuple of ndarray[int64]
        Both have ``len(left)`` entries. ``left_indexer[i] == i``;
        ``right_indexer[i]`` is the matched position in ``right`` or -1
        when there is no match (or the match exceeds ``tolerance``).
    """
    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        ndarray[int64_t] left_values_, right_values_
        int64_t tolerance_, diff

    # if we are using tolerance, bind the typed views used in the loop
    if (left_values is not None and right_values is not None
            and tolerance is not None):
        has_tolerance = 1
        left_values_ = left_values
        right_values_ = right_values
        tolerance_ = tolerance

    left_size = len(left)
    right_size = len(right)

    # exactly one output row per left row
    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        # (i.e. the previous left row had no match at all)
        if right_pos < 0:
            right_pos = 0

        # advance to one past the last right key that is <= the left key
        # (or < the left key when exact matches are not allowed)
        if allow_exact_matches:
            while (right_pos < right_size
                    and right[right_pos] <= left[left_pos]):
                right_pos += 1
        else:
            while (right_pos < right_size
                    and right[right_pos] < left[left_pos]):
                right_pos += 1
        # step back onto the match itself; becomes -1 when nothing matched
        right_pos -= 1

        # save positions as the desired index
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = right_pos

        # if needed, verify that tolerance is met; read through the typed
        # buffers so the subtraction stays in C int64 arithmetic
        if has_tolerance and right_pos != -1:
            diff = left_values_[left_pos] - right_values_[right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
275
181
0 commit comments