Skip to content

Commit e42ecc2

Browse files
committed
ENH: left/right merge operations working and fairly fast, #249
1 parent fc4ca8d commit e42ecc2

File tree

2 files changed

+159
-23
lines changed

2 files changed

+159
-23
lines changed

pandas/tools/merge.py

+37-21
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pandas.core.frame import DataFrame
88
from pandas.core.index import Index
99
from pandas.core.internals import _JoinOperation
10+
import pandas.core.common as com
1011

1112
import pandas._tseries as lib
1213
from pandas._sandbox import Factorizer
@@ -108,11 +109,25 @@ def get_result(self):
108109

109110
new_axis = Index(np.arange(len(left_indexer)))
110111

112+
# TODO: more efficiently handle group keys to avoid extra consolidation!
113+
111114
join_op = _JoinOperation(ldata, rdata, new_axis,
112115
left_indexer, right_indexer, axis=1)
113116

114117
result_data = join_op.get_result(copy=self.copy)
115-
return DataFrame(result_data)
118+
result = DataFrame(result_data)
119+
120+
# insert group keys
121+
for i, name in enumerate(join_names):
122+
# a faster way?
123+
key_col = com.take_1d(left_join_keys[i], left_indexer)
124+
na_indexer = (left_indexer == -1).nonzero()[0]
125+
right_na_indexer = right_indexer.take(na_indexer)
126+
key_col.put(na_indexer, com.take_1d(right_join_keys[i],
127+
right_na_indexer))
128+
result.insert(i, name, key_col)
129+
130+
return result
116131

117132
def _get_merge_data(self, join_names):
118133
"""
@@ -148,8 +163,8 @@ def _get_merge_keys(self):
148163
right_keys = []
149164
join_names = []
150165

151-
need_set_names = False
152-
pop_right = False
166+
# need_set_names = False
167+
# pop_right = False
153168

154169
if (self.on is None and self.left_on is None
155170
and self.right_on is None):
@@ -158,7 +173,8 @@ def _get_merge_keys(self):
158173
left_keys.append(self.left.index.values)
159174
right_keys.append(self.right.index.values)
160175

161-
need_set_names = True
176+
# need_set_names = True
177+
162178
# XXX something better than this
163179
join_names.append('join_key')
164180
elif self.left_index:
@@ -173,30 +189,30 @@ def _get_merge_keys(self):
173189
# use the common columns
174190
common_cols = self.left.columns.intersection(self.right.columns)
175191
self.left_on = self.right_on = common_cols
176-
pop_right = True
192+
193+
# pop_right = True
194+
177195
elif self.on is not None:
178196
if self.left_on is not None or self.right_on is not None:
179197
raise Exception('Can only pass on OR left_on and '
180198
'right_on')
181199
self.left_on = self.right_on = self.on
182-
pop_right = True
183200

184-
if self.right_on is not None:
185-
# this is a touch kludgy, but accomplishes the goal
186-
if pop_right:
187-
right = self.right.copy()
188-
right_keys.extend([right.pop(k) for k in self.right_on])
189-
self.right = right
190-
else:
191-
right_keys.extend([right[k] for k in self.right_on])
201+
# pop_right = True
192202

193-
if need_set_names:
194-
self.left = self.left.copy()
195-
for i, (lkey, name) in enumerate(zip(left_keys, join_names)):
196-
self.left.insert(i, name, lkey)
203+
# this is a touch kludgy, but accomplishes the goal
204+
if self.right_on is not None:
205+
right = self.right.copy()
206+
right_keys.extend([right.pop(k) for k in self.right_on])
207+
self.right = right
197208

198209
if self.left_on is not None:
199-
left_keys.extend([self.left[k] for k in self.left_on])
210+
left = self.left.copy()
211+
left_keys.extend([left.pop(k) for k in self.left_on])
212+
self.left = left
213+
214+
# TODO: something else?
215+
join_names = self.left_on
200216

201217
return left_keys, right_keys, join_names
202218

@@ -253,8 +269,8 @@ def _maybe_make_list(obj):
253269
return [obj]
254270
return obj
255271

256-
def _right_outer_join(x, y, max_groups):
    """Right outer join of key arrays `x` and `y`.

    Implemented by running the Cython left outer join with the operands
    swapped, then un-swapping the returned indexer pair so the caller
    still receives (left_indexer, right_indexer).
    """
    ridx, lidx = sbx.left_outer_join(y, x, max_groups)
    return lidx, ridx
259275

260276
_join_functions = {

pandas/tools/tests/test_merge.py

+122-2
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,44 @@
22
import unittest
33

44
import numpy as np
5+
import random
56

7+
from pandas import *
68
from pandas.tools.merge import merge
79
import pandas._sandbox as sbx
810

911
a_ = np.array
1012

13+
N = 100
NGROUPS = 8

def get_test_data(ngroups=NGROUPS, n=N):
    """Return a shuffled object ndarray of length `n` cycling through the
    integer group labels 0..ngroups-1.

    Parameters
    ----------
    ngroups : int, number of distinct group labels
    n : int, total length of the returned array

    Notes
    -----
    Uses floor division so the `np.tile` repetition count stays integral
    under true division, and materializes ``range`` as a list so the
    padding concatenation works when `ngroups` does not evenly divide `n`.
    """
    unique_groups = list(range(ngroups))
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        # pad with the beginning of one more cycle of labels
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr
26+
1127
class TestMerge(unittest.TestCase):
1228

1329
def setUp(self):
    """Build two random frames sharing 'key1'/'key2' group columns."""
    # aggregate multiple columns
    self.df = DataFrame({'key1': get_test_data(),
                         'key2': get_test_data(),
                         'data1': np.random.randn(N),
                         'data2': np.random.randn(N)})

    # exclude a couple keys for fun
    self.df = self.df[self.df['key2'] > 1]

    n_small = N // 5
    self.df2 = DataFrame({'key1': get_test_data(n=n_small),
                          'key2': get_test_data(ngroups=NGROUPS // 2,
                                                n=n_small),
                          'value': np.random.randn(n_small)})
1543

1644
def test_cython_left_outer_join(self):
1745
left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype='i4')
@@ -92,7 +120,31 @@ def test_cython_inner_join(self):
92120
def test_cython_full_outer_join(self):
93121
pass
94122

95-
def test_left_outer_join(self):
    """Left joins on a single key and on all common keys."""
    on_key2 = merge(self.df, self.df2, on='key2')
    _check_join(self.df, self.df2, on_key2, ['key2'], how='left')

    on_both = merge(self.df, self.df2)
    _check_join(self.df, self.df2, on_both, ['key1', 'key2'],
                how='left')
130+
131+
def test_right_outer_join(self):
    """Right joins on a single key and on all common keys."""
    on_key2 = merge(self.df, self.df2, on='key2', how='right')
    _check_join(self.df, self.df2, on_key2, ['key2'], how='right')

    on_both = merge(self.df, self.df2, how='right')
    _check_join(self.df, self.df2, on_both, ['key1', 'key2'],
                how='right')
138+
139+
# def test_full_outer_join(self):
140+
# joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
141+
# _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')
142+
143+
# joined_both = merge(self.df, self.df2, how='outer')
144+
# _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
145+
# how='outer')
146+
147+
def test_handle_overlap(self):
96148
pass
97149

98150
def test_merge_common(self):
@@ -101,6 +153,74 @@ def test_merge_common(self):
101153
def test_merge_index(self):
102154
pass
103155

156+
def _check_join(left, right, result, join_col, how='left',
                lsuffix='.x', rsuffix='.y'):
    """Validate a merge result group-by-group against its source frames.

    Every key group of `result` must either reproduce the matching source
    group exactly, or — when the key is absent from one side in an outer
    join — be entirely NA in that side's columns.
    """

    # smoke test: join keys themselves must never be null in the result
    for col in join_col:
        assert(result[col].notnull().all())

    grouped_left = left.groupby(join_col)
    grouped_right = right.groupby(join_col)

    for key, chunk in result.groupby(join_col):
        from_left = _restrict_to_columns(chunk, left.columns, lsuffix)
        from_right = _restrict_to_columns(chunk, right.columns, rsuffix)

        # check each side: compare against the source group when the key
        # exists there, otherwise require all-NA fill values
        for side, grouped, joined, columns in [
                ('left', grouped_left, from_left, left.columns),
                ('right', grouped_right, from_right, right.columns)]:
            try:
                source_group = grouped.get_group(key)
            except KeyError:
                if how == side:
                    # a '<side>' join may only contain keys from that side
                    raise AssertionError('key %s should not have been in the join'
                                         % str(key))

                _assert_all_na(joined, columns, join_col)
            else:
                _assert_same_contents(joined, source_group)
191+
192+
193+
def _restrict_to_columns(group, columns, suffix):
194+
found = [c for c in group.columns
195+
if c in columns or c.replace(suffix, '') in columns]
196+
197+
# filter
198+
group = group.ix[:, found]
199+
200+
# get rid of suffixes, if any
201+
group = group.rename(columns=lambda x: x.replace(suffix, ''))
202+
203+
# put in the right order...
204+
group = group.ix[:, columns]
205+
206+
return group
207+
208+
def _assert_same_contents(join_chunk, source):
209+
NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
210+
211+
jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
212+
svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
213+
214+
rows = set(tuple(row) for row in jvalues)
215+
assert(len(rows) == len(source))
216+
assert(all(tuple(row) in rows for row in svalues))
217+
218+
def _assert_all_na(join_chunk, source_columns, join_col):
219+
for c in source_columns:
220+
if c in join_col:
221+
continue
222+
assert(join_chunk[c].isnull().all())
223+
104224
if __name__ == '__main__':
105225
import nose
106226
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)