improves merge performance when key space exceeds i8 bounds

behzadnouri · behzadnouri · commit 7ff782b787c5 · 2014-12-28T15:51:53.000-05:00
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -68,6 +68,7 @@ Performance
 - Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
 - Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
 - Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`)
+- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -4,7 +4,7 @@
 import types
 
 import numpy as np
-from pandas.compat import range, long, lrange, lzip, zip
+from pandas.compat import range, long, lrange, lzip, zip, map, filter
 import pandas.compat as compat
 from pandas.core.categorical import Categorical
 from pandas.core.frame import DataFrame, _merge_doc
@@ -450,39 +450,29 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'):
     -------
 
     """
-    if len(left_keys) != len(right_keys):
-        raise AssertionError('left_key and right_keys must be the same length')
+    from functools import partial
 
-    left_labels = []
-    right_labels = []
-    group_sizes = []
+    assert len(left_keys) == len(right_keys), \
+            'left_key and right_keys must be the same length'
 
-    for lk, rk in zip(left_keys, right_keys):
-        llab, rlab, count = _factorize_keys(lk, rk, sort=sort)
+    # bind `sort` arg. of _factorize_keys
+    fkeys = partial(_factorize_keys, sort=sort)
 
-        left_labels.append(llab)
-        right_labels.append(rlab)
-        group_sizes.append(count)
+    # get left & right join labels and num. of levels at each location
+    llab, rlab, shape = map(list, zip( * map(fkeys, left_keys, right_keys)))
 
-    max_groups = long(1)
-    for x in group_sizes:
-        max_groups *= long(x)
+    # get flat i8 keys from label lists
+    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)
 
-    if max_groups > 2 ** 63:  # pragma: no cover
-        left_group_key, right_group_key, max_groups = \
-            _factorize_keys(lib.fast_zip(left_labels),
-                            lib.fast_zip(right_labels))
-    else:
-        left_group_key = get_group_index(left_labels, group_sizes)
-        right_group_key = get_group_index(right_labels, group_sizes)
-
-        left_group_key, right_group_key, max_groups = \
-            _factorize_keys(left_group_key, right_group_key, sort=sort)
+    # factorize keys to a dense i8 space
+    # `count` is the num. of unique keys
+    # set(lkey) | set(rkey) == range(count)
+    lkey, rkey, count = fkeys(lkey, rkey)
 
     # preserve left frame order if how == 'left' and sort == False
     kwargs = {'sort':sort} if how == 'left' else {}
     join_func = _join_functions[how]
-    return join_func(left_group_key, right_group_key, max_groups, **kwargs)
+    return join_func(lkey, rkey, count, **kwargs)
 
 
 class _OrderedMerge(_MergeOperation):
@@ -590,9 +580,9 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
         # if asked to sort or there are 1-to-many matches
         join_index = left_ax.take(left_indexer)
         return join_index, left_indexer, right_indexer
-    else:
-        # left frame preserves order & length of its index
-        return left_ax, None, right_indexer
+
+    # left frame preserves order & length of its index
+    return left_ax, None, right_indexer
 
 
 def _right_outer_join(x, y, max_groups):
@@ -663,6 +653,35 @@ def _sort_labels(uniques, left, right):
     return new_left, new_right
 
 
+def _get_join_keys(llab, rlab, shape, sort):
+    from pandas.core.groupby import _int64_overflow_possible
+
+    # how many levels can be done without overflow
+    pred = lambda i: not _int64_overflow_possible(shape[:i])
+    nlev = next(filter(pred, range(len(shape), 0, -1)))
+
+    # get keys for the first `nlev` levels
+    stride = np.prod(shape[1:nlev], dtype='i8')
+    lkey = stride * llab[0].astype('i8', subok=False, copy=False)
+    rkey = stride * rlab[0].astype('i8', subok=False, copy=False)
+
+    for i in range(1, nlev):
+        stride //= shape[i]
+        lkey += llab[i] * stride
+        rkey += rlab[i] * stride
+
+    if nlev == len(shape):  # all done!
+        return lkey, rkey
+
+    # densify current keys to avoid overflow
+    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
+
+    llab = [lkey] + llab[nlev:]
+    rlab = [rkey] + rlab[nlev:]
+    shape = [count] + shape[nlev:]
+
+    return _get_join_keys(llab, rlab, shape, sort)
+
 #----------------------------------------------------------------------
 # Concatenate DataFrame objects
 
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -1141,6 +1141,10 @@ def test_merge_na_keys(self):
         tm.assert_frame_equal(result, expected)
 
     def test_int64_overflow_issues(self):
+        from itertools import product
+        from collections import defaultdict
+        from pandas.core.groupby import _int64_overflow_possible
+
         # #2690, combinatorial explosion
         df1 = DataFrame(np.random.randn(1000, 7),
                         columns=list('ABCDEF') + ['G1'])
@@ -1151,6 +1155,119 @@ def test_int64_overflow_issues(self):
         result = merge(df1, df2, how='outer')
         self.assertTrue(len(result) == 2000)
 
+        low, high, n = -1 << 10, 1 << 10, 1 << 20
+        left = DataFrame(np.random.randint(low, high, (n, 7)),
+                         columns=list('ABCDEFG'))
+        left['left'] = left.sum(axis=1)
+
+        # one-2-one match
+        i = np.random.permutation(len(left))
+        right = left.iloc[i].copy()
+        right.columns = right.columns[:-1].tolist() + ['right']
+        right.index = np.arange(len(right))
+        right['right'] *= -1
+
+        out = merge(left, right, how='outer')
+        self.assertEqual(len(out), len(left))
+        assert_series_equal(out['left'], - out['right'])
+        assert_series_equal(out['left'], out.iloc[:, :-2].sum(axis=1))
+
+        out.sort(out.columns.tolist(), inplace=True)
+        out.index = np.arange(len(out))
+        for how in ['left', 'right', 'outer', 'inner']:
+            assert_frame_equal(out, merge(left, right, how=how, sort=True))
+
+        # check that left merge w/ sort=False maintains left frame order
+        out = merge(left, right, how='left', sort=False)
+        assert_frame_equal(left, out[left.columns.tolist()])
+
+        out = merge(right, left, how='left', sort=False)
+        assert_frame_equal(right, out[right.columns.tolist()])
+
+        # one-2-many/none match
+        n = 1 << 11
+        left = DataFrame(np.random.randint(low, high, (n, 7)),
+                         columns=list('ABCDEFG'))
+
+        # confirm that this is checking what it is supposed to check
+        shape = left.apply(pd.Series.nunique).values
+        self.assertTrue(_int64_overflow_possible(shape))
+
+        # add duplicates to left frame
+        left = pd.concat([left, left], ignore_index=True)
+
+        right = DataFrame(np.random.randint(low, high, (n // 2, 7)),
+                          columns=list('ABCDEFG'))
+
+        # add duplicates & overlap with left to the right frame
+        i = np.random.choice(len(left), n)
+        right = pd.concat([right, right, left.iloc[i]], ignore_index=True)
+
+        left['left'] = np.random.randn(len(left))
+        right['right'] = np.random.randn(len(right))
+
+        # shuffle left & right frames
+        i = np.random.permutation(len(left))
+        left = left.iloc[i].copy()
+        left.index = np.arange(len(left))
+
+        i = np.random.permutation(len(right))
+        right = right.iloc[i].copy()
+        right.index = np.arange(len(right))
+
+        # manually compute outer merge
+        ldict, rdict = defaultdict(list), defaultdict(list)
+
+        for idx, row in left.set_index(list('ABCDEFG')).iterrows():
+            ldict[idx].append(row['left'])
+
+        for idx, row in right.set_index(list('ABCDEFG')).iterrows():
+            rdict[idx].append(row['right'])
+
+        vals = []
+        for k, lval in ldict.items():
+            rval = rdict.get(k, [np.nan])
+            for lv, rv in product(lval, rval):
+                vals.append(k + tuple([lv, rv]))
+
+        for k, rval in rdict.items():
+            if k not in ldict:
+                for rv in rval:
+                    vals.append(k + tuple([np.nan, rv]))
+
+        def align(df):
+            df = df.sort(df.columns.tolist())
+            df.index = np.arange(len(df))
+            return df
+
+        def verify_order(df):
+            kcols = list('ABCDEFG')
+            assert_frame_equal(df[kcols].copy(),
+                               df[kcols].sort(kcols, kind='mergesort'))
+
+        out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right'])
+        out = align(out)
+
+        jmask = {'left': out['left'].notnull(),
+                 'right': out['right'].notnull(),
+                 'inner': out['left'].notnull() & out['right'].notnull(),
+                 'outer': np.ones(len(out), dtype='bool')}
+
+        for how in 'left', 'right', 'outer', 'inner':
+            mask = jmask[how]
+            frame = align(out[mask].copy())
+            self.assertTrue(mask.all() ^ mask.any() or how == 'outer')
+
+            for sort in [False, True]:
+                res = merge(left, right, how=how, sort=sort)
+                if sort:
+                    verify_order(res)
+
+                # as in GH9092 dtypes break with outer/right join
+                assert_frame_equal(frame, align(res),
+                                   check_dtype=how not in ('right', 'outer'))
+
+
     def test_join_multi_levels(self):
 
         # GH 3662
diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py
@@ -249,4 +249,22 @@ def sample(values, k):
         columns=['jolie', 'jolia']).set_index('jolie')
 '''
 
-left_outer_join_index = Benchmark("left.join(right, on='jim')", setup)
+left_outer_join_index = Benchmark("left.join(right, on='jim')", setup,
+                                  name='left_outer_join_index')
+
+
+setup = common_setup + """
+low, high, n = -1 << 10, 1 << 10, 1 << 20
+left = DataFrame(np.random.randint(low, high, (n, 7)),
+                    columns=list('ABCDEFG'))
+left['left'] = left.sum(axis=1)
+
+i = np.random.permutation(len(left))
+right = left.iloc[i].copy()
+right.columns = right.columns[:-1].tolist() + ['right']
+right.index = np.arange(len(right))
+right['right'] *= -1
+"""
+
+i8merge = Benchmark("merge(left, right, how='outer')", setup,
+                    name='i8merge')