Skip to content

Commit a749912

Browse files
committed
ENH: optimize join/merge on integer keys, close #682
1 parent 50ae526 commit a749912

File tree

4 files changed

+28
-25
lines changed

4 files changed

+28
-25
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pandas 0.8.0
5757
- Improve the speed of "square" reindexing of homogeneous DataFrame objects
5858
by a significant margin (#836)
5959
- Handle more dtypes when passed MaskedArrays in DataFrame constructor (#406)
60+
- Improved performance of join operations on integer keys (#682)
6061

6162
**API Changes**
6263

pandas/src/hashtable.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -823,9 +823,10 @@ cdef class Int64Factorizer:
823823
def get_count(self):
824824
return self.count
825825

826-
def factorize(self, ndarray[int64_t] values, sort=False):
826+
def factorize(self, ndarray[int64_t] values, sort=False,
827+
na_sentinel=-1):
827828
labels, counts = self.table.get_labels(values, self.uniques,
828-
self.count, -1)
829+
self.count, na_sentinel)
829830

830831
# sort on
831832
if sort:

pandas/tools/merge.py

+18-23
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ def _get_group_keys(self):
304304
group_sizes = []
305305

306306
for lk, rk in zip(left_keys, right_keys):
307-
llab, rlab, count = _factorize_objects(lk, rk, sort=self.sort)
307+
llab, rlab, count = _factorize_keys(lk, rk, sort=self.sort)
308308

309309
left_labels.append(llab)
310310
right_labels.append(rlab)
@@ -321,24 +321,24 @@ def _get_group_keys(self):
321321
raise Exception('Combinatorial explosion! (boom)')
322322

323323
left_group_key, right_group_key, max_groups = \
324-
_factorize_int64(left_group_key, right_group_key,
324+
_factorize_keys(left_group_key, right_group_key,
325325
sort=self.sort)
326326
return left_group_key, right_group_key, max_groups
327327

328328
def _get_multiindex_indexer(join_keys, index, sort=False):
329329
shape = []
330330
labels = []
331331
for level, key in zip(index.levels, join_keys):
332-
llab, rlab, count = _factorize_objects(level, key, sort=False)
332+
llab, rlab, count = _factorize_keys(level, key, sort=False)
333333
labels.append(rlab)
334334
shape.append(count)
335335

336336
left_group_key = get_group_index(labels, shape)
337337
right_group_key = get_group_index(index.labels, shape)
338338

339339
left_group_key, right_group_key, max_groups = \
340-
_factorize_int64(left_group_key, right_group_key,
341-
sort=False)
340+
_factorize_keys(left_group_key, right_group_key,
341+
sort=False)
342342

343343
left_indexer, right_indexer = \
344344
lib.left_outer_join(com._ensure_int64(left_group_key),
@@ -348,7 +348,7 @@ def _get_multiindex_indexer(join_keys, index, sort=False):
348348
return left_indexer, right_indexer
349349

350350
def _get_single_indexer(join_key, index, sort=False):
351-
left_key, right_key, count = _factorize_objects(join_key, index, sort=sort)
351+
left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)
352352

353353
left_indexer, right_indexer = \
354354
lib.left_outer_join(com._ensure_int64(left_key),
@@ -394,26 +394,21 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
394394
'outer' : lib.full_outer_join,
395395
}
396396

397-
def _factorize_int64(left_index, right_index, sort=True):
398-
rizer = lib.Int64Factorizer(max(len(left_index), len(right_index)))
399397

400-
# 32-bit compatibility
401-
left_index = com._ensure_int64(left_index)
402-
right_index = com._ensure_int64(right_index)
403-
404-
llab, _ = rizer.factorize(left_index)
405-
rlab, _ = rizer.factorize(right_index)
406-
407-
if sort:
408-
llab, rlab = _sort_labels(np.array(rizer.uniques), llab, rlab)
409-
410-
return llab, rlab, rizer.get_count()
398+
def _factorize_keys(lk, rk, sort=True):
399+
if com.is_integer_dtype(lk) and com.is_integer_dtype(rk):
400+
klass = lib.Int64Factorizer
401+
lk = com._ensure_int64(lk)
402+
rk = com._ensure_int64(rk)
403+
else:
404+
klass = lib.Factorizer
405+
lk = com._ensure_object(lk)
406+
rk = com._ensure_object(rk)
411407

412-
def _factorize_objects(left_index, right_index, sort=True):
413-
rizer = lib.Factorizer(max(len(left_index), len(right_index)))
408+
rizer = klass(max(len(lk), len(rk)))
414409

415-
llab, _ = rizer.factorize(left_index.astype('O'))
416-
rlab, _ = rizer.factorize(right_index.astype('O'))
410+
llab, _ = rizer.factorize(lk)
411+
rlab, _ = rizer.factorize(rk)
417412

418413
count = rizer.get_count()
419414

vb_suite/join_merge.py

+6
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@
6666
name='join_dataframe_index_multi',
6767
start_date=datetime(2011, 10, 20))
6868

69+
#----------------------------------------------------------------------
70+
# Joins on integer keys
71+
72+
join_dataframe_integer_key = Benchmark("merge(df, df2, on='key')", setup,
73+
start_date=datetime(2011, 10, 20))
74+
6975
#----------------------------------------------------------------------
7076
# DataFrame joins on index
7177

0 commit comments

Comments
 (0)