Skip to content

Commit 7ff782b

Browse files
committed
improves merge performance when key space exceeds i8 bounds
1 parent def58c9 commit 7ff782b

File tree

4 files changed

+184
-29
lines changed

4 files changed

+184
-29
lines changed

doc/source/whatsnew/v0.16.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ Performance
6868
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
6969
- Improved the speed of ``nunique`` by calling ``unique`` instead of ``value_counts`` (:issue:`9129`, :issue:`7771`)
7070
- Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
71+
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
7172

7273
Bug Fixes
7374
~~~~~~~~~

pandas/tools/merge.py

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import types
55

66
import numpy as np
7-
from pandas.compat import range, long, lrange, lzip, zip
7+
from pandas.compat import range, long, lrange, lzip, zip, map, filter
88
import pandas.compat as compat
99
from pandas.core.categorical import Categorical
1010
from pandas.core.frame import DataFrame, _merge_doc
@@ -450,39 +450,29 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
450450
-------
451451
452452
"""
453-
if len(left_keys) != len(right_keys):
454-
raise AssertionError('left_key and right_keys must be the same length')
453+
from functools import partial
455454

456-
left_labels = []
457-
right_labels = []
458-
group_sizes = []
455+
assert len(left_keys) == len(right_keys), \
456+
'left_key and right_keys must be the same length'
459457

460-
for lk, rk in zip(left_keys, right_keys):
461-
llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
458+
# bind `sort` arg. of _factorize_keys
459+
fkeys = partial(_factorize_keys, sort=sort)
462460

463-
left_labels.append(llab)
464-
right_labels.append(rlab)
465-
group_sizes.append(count)
461+
# get left & right join labels and num. of levels at each location
462+
llab, rlab, shape = map(list, zip( * map(fkeys, left_keys, right_keys)))
466463

467-
max_groups = long(1)
468-
for x in group_sizes:
469-
max_groups *= long(x)
464+
# get flat i8 keys from label lists
465+
lkey, rkey = _get_join_keys(llab, rlab, shape, sort)
470466

471-
if max_groups > 2 ** 63: # pragma: no cover
472-
left_group_key, right_group_key, max_groups = \
473-
_factorize_keys(lib.fast_zip(left_labels),
474-
lib.fast_zip(right_labels))
475-
else:
476-
left_group_key = get_group_index(left_labels, group_sizes)
477-
right_group_key = get_group_index(right_labels, group_sizes)
478-
479-
left_group_key, right_group_key, max_groups = \
480-
_factorize_keys(left_group_key, right_group_key, sort=sort)
467+
# factorize keys to a dense i8 space
468+
# `count` is the num. of unique keys
469+
# set(lkey) | set(rkey) == range(count)
470+
lkey, rkey, count = fkeys(lkey, rkey)
481471

482472
# preserve left frame order if how == 'left' and sort == False
483473
kwargs = {'sort':sort} if how == 'left' else {}
484474
join_func = _join_functions[how]
485-
return join_func(left_group_key, right_group_key, max_groups, **kwargs)
475+
return join_func(lkey, rkey, count, **kwargs)
486476

487477

488478
class _OrderedMerge(_MergeOperation):
@@ -590,9 +580,9 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
590580
# if asked to sort or there are 1-to-many matches
591581
join_index = left_ax.take(left_indexer)
592582
return join_index, left_indexer, right_indexer
593-
else:
594-
# left frame preserves order & length of its index
595-
return left_ax, None, right_indexer
583+
584+
# left frame preserves order & length of its index
585+
return left_ax, None, right_indexer
596586

597587

598588
def _right_outer_join(x, y, max_groups):
@@ -663,6 +653,35 @@ def _sort_labels(uniques, left, right):
663653
return new_left, new_right
664654

665655

656+
def _get_join_keys(llab, rlab, shape, sort):
    """Collapse per-level join labels into one flat int64 key per row.

    Parameters
    ----------
    llab, rlab : list of ndarray
        Integer labels for the left / right side, one array per join level.
    shape : list of int
        Size of the label space at each level; same length as ``llab`` and
        ``rlab``.
    sort : bool
        Forwarded to ``_factorize_keys`` whenever the intermediate keys have
        to be densified.

    Returns
    -------
    lkey, rkey : ndarray of int64
        Flat keys built from the labels of every level.

    Notes
    -----
    As many leading levels as fit into int64 are combined in one pass.  If
    levels remain, the partial keys are factorized into a dense range and the
    function recurses with that dense key standing in for the consumed levels.
    """
    from pandas.core.groupby import _int64_overflow_possible

    # longest prefix of levels whose combined key space cannot overflow int64;
    # a generator expression keeps this lazy without needing a lazy `filter`
    nlev = next(i for i in range(len(shape), 0, -1)
                if not _int64_overflow_possible(shape[:i]))

    # get keys for the first `nlev` levels (mixed-radix encoding);
    # multiplying by `stride` allocates fresh arrays, so the in-place
    # additions below never touch the caller's label arrays
    stride = np.prod(shape[1:nlev], dtype='i8')
    lkey = stride * llab[0].astype('i8', subok=False, copy=False)
    rkey = stride * rlab[0].astype('i8', subok=False, copy=False)

    for i in range(1, nlev):
        # `stride` is a numpy int64 scalar: when a level has zero labels
        # (e.g. empty inputs) this is 0 // 0, which numpy evaluates to 0 but
        # reports with a RuntimeWarning.  The result is still what we want,
        # so only the warning is suppressed.
        with np.errstate(divide='ignore'):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    # the dense key replaces the levels consumed so far
    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
684+
666685
#----------------------------------------------------------------------
667686
# Concatenate DataFrame objects
668687

pandas/tools/tests/test_merge.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,6 +1141,10 @@ def test_merge_na_keys(self):
11411141
tm.assert_frame_equal(result, expected)
11421142

11431143
def test_int64_overflow_issues(self):
1144+
from itertools import product
1145+
from collections import defaultdict
1146+
from pandas.core.groupby import _int64_overflow_possible
1147+
11441148
# #2690, combinatorial explosion
11451149
df1 = DataFrame(np.random.randn(1000, 7),
11461150
columns=list('ABCDEF') + ['G1'])
@@ -1151,6 +1155,119 @@ def test_int64_overflow_issues(self):
11511155
result = merge(df1, df2, how='outer')
11521156
self.assertTrue(len(result) == 2000)
11531157

1158+
low, high, n = -1 << 10, 1 << 10, 1 << 20
1159+
left = DataFrame(np.random.randint(low, high, (n, 7)),
1160+
columns=list('ABCDEFG'))
1161+
left['left'] = left.sum(axis=1)
1162+
1163+
# one-2-one match
1164+
i = np.random.permutation(len(left))
1165+
right = left.iloc[i].copy()
1166+
right.columns = right.columns[:-1].tolist() + ['right']
1167+
right.index = np.arange(len(right))
1168+
right['right'] *= -1
1169+
1170+
out = merge(left, right, how='outer')
1171+
self.assertEqual(len(out), len(left))
1172+
assert_series_equal(out['left'], - out['right'])
1173+
assert_series_equal(out['left'], out.iloc[:, :-2].sum(axis=1))
1174+
1175+
out.sort(out.columns.tolist(), inplace=True)
1176+
out.index = np.arange(len(out))
1177+
for how in ['left', 'right', 'outer', 'inner']:
1178+
assert_frame_equal(out, merge(left, right, how=how, sort=True))
1179+
1180+
# check that left merge w/ sort=False maintains left frame order
1181+
out = merge(left, right, how='left', sort=False)
1182+
assert_frame_equal(left, out[left.columns.tolist()])
1183+
1184+
out = merge(right, left, how='left', sort=False)
1185+
assert_frame_equal(right, out[right.columns.tolist()])
1186+
1187+
# one-2-many/none match
1188+
n = 1 << 11
1189+
left = DataFrame(np.random.randint(low, high, (n, 7)),
1190+
columns=list('ABCDEFG'))
1191+
1192+
# confirm that this is checking what it is supposed to check
1193+
shape = left.apply(pd.Series.nunique).values
1194+
self.assertTrue(_int64_overflow_possible(shape))
1195+
1196+
# add duplicates to left frame
1197+
left = pd.concat([left, left], ignore_index=True)
1198+
1199+
right = DataFrame(np.random.randint(low, high, (n // 2, 7)),
1200+
columns=list('ABCDEFG'))
1201+
1202+
# add duplicates & overlap with left to the right frame
1203+
i = np.random.choice(len(left), n)
1204+
right = pd.concat([right, right, left.iloc[i]], ignore_index=True)
1205+
1206+
left['left'] = np.random.randn(len(left))
1207+
right['right'] = np.random.randn(len(right))
1208+
1209+
# shuffle left & right frames
1210+
i = np.random.permutation(len(left))
1211+
left = left.iloc[i].copy()
1212+
left.index = np.arange(len(left))
1213+
1214+
i = np.random.permutation(len(right))
1215+
right = right.iloc[i].copy()
1216+
right.index = np.arange(len(right))
1217+
1218+
# manually compute outer merge
1219+
ldict, rdict = defaultdict(list), defaultdict(list)
1220+
1221+
for idx, row in left.set_index(list('ABCDEFG')).iterrows():
1222+
ldict[idx].append(row['left'])
1223+
1224+
for idx, row in right.set_index(list('ABCDEFG')).iterrows():
1225+
rdict[idx].append(row['right'])
1226+
1227+
vals = []
1228+
for k, lval in ldict.items():
1229+
rval = rdict.get(k, [np.nan])
1230+
for lv, rv in product(lval, rval):
1231+
vals.append(k + tuple([lv, rv]))
1232+
1233+
for k, rval in rdict.items():
1234+
if k not in ldict:
1235+
for rv in rval:
1236+
vals.append(k + tuple([np.nan, rv]))
1237+
1238+
def align(df):
1239+
df = df.sort(df.columns.tolist())
1240+
df.index = np.arange(len(df))
1241+
return df
1242+
1243+
def verify_order(df):
1244+
kcols = list('ABCDEFG')
1245+
assert_frame_equal(df[kcols].copy(),
1246+
df[kcols].sort(kcols, kind='mergesort'))
1247+
1248+
out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
1249+
out = align(out)
1250+
1251+
jmask = {'left': out['left'].notnull(),
1252+
'right': out['right'].notnull(),
1253+
'inner': out['left'].notnull() & out['right'].notnull(),
1254+
'outer': np.ones(len(out), dtype='bool')}
1255+
1256+
for how in 'left', 'right', 'outer', 'inner':
1257+
mask = jmask[how]
1258+
frame = align(out[mask].copy())
1259+
self.assertTrue(mask.all() ^ mask.any() or how == 'outer')
1260+
1261+
for sort in [False, True]:
1262+
res = merge(left, right, how=how, sort=sort)
1263+
if sort:
1264+
verify_order(res)
1265+
1266+
# as in GH9092 dtypes break with outer/right join
1267+
assert_frame_equal(frame, align(res),
1268+
check_dtype=how not in ('right', 'outer'))
1269+
1270+
11541271
def test_join_multi_levels(self):
11551272

11561273
# GH 3662

vb_suite/join_merge.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,4 +249,22 @@ def sample(values, k):
249249
columns=['jolie', 'jolia']).set_index('jolie')
250250
'''
251251

252-
left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)
252+
left_outer_join_index = Benchmark("left.join(right, on='jim')", setup,
253+
name='left_outer_join_index')
254+
255+
256+
setup = common_setup + """
257+
low, high, n = -1 << 10, 1 << 10, 1 << 20
258+
left = DataFrame(np.random.randint(low, high, (n, 7)),
259+
columns=list('ABCDEFG'))
260+
left['left'] = left.sum(axis=1)
261+
262+
i = np.random.permutation(len(left))
263+
right = left.iloc[i].copy()
264+
right.columns = right.columns[:-1].tolist() + ['right']
265+
right.index = np.arange(len(right))
266+
right['right'] *= -1
267+
"""
268+
269+
i8merge = Benchmark("merge(left, right, how='outer')", setup,
270+
name='i8merge')

0 commit comments

Comments
 (0)