BUG: Allow multiple 'by' parameters in merge_asof() when DataFrames are indexed (pandas-dev#15676)

Christopher C. Aycock · jreback · commit 2621b31c7dbd · 2017-03-14T10:05:41.000-04:00
closes pandas-dev#15676

Author: Christopher C. Aycock <christopher.aycock@twosigma.com>

Closes pandas-dev#15679 from chrisaycock/GH15676 and squashes the following commits:

965caf2 [Christopher C. Aycock] Verify that 'by' parameters are the same length
4a2cc09 [Christopher C. Aycock] BUG: Allow multiple 'by' parameters in merge_asof() when DataFrames are indexed (pandas-dev#15676)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -884,6 +884,7 @@ Bug Fixes
 - Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`)
 
 
+- Bug in ``pd.merge_asof()`` where ``left_index`` or ``right_index`` caused a failure when multiple ``by`` parameters were specified (:issue:`15676`)
 - Bug in ``pd.merge_asof()`` where ``left_index``/``right_index`` together caused a failure when ``tolerance`` was specified (:issue:`15135`)
 - Bug in ``DataFrame.pivot_table()`` where ``dropna=True`` would not drop all-NaN columns when the columns was a ``category`` dtype (:issue:`15193`)
 
diff --git a/pandas/tests/tools/test_merge_asof.py b/pandas/tests/tools/test_merge_asof.py
@@ -368,6 +368,41 @@ def test_multiby_heterogeneous_types(self):
                                by=['ticker', 'exch'])
         assert_frame_equal(result, expected)
 
+    def test_multiby_indexed(self):
+        # GH15676
+        left = pd.DataFrame([
+            [pd.to_datetime('20160602'), 1, 'a'],
+            [pd.to_datetime('20160602'), 2, 'a'],
+            [pd.to_datetime('20160603'), 1, 'b'],
+            [pd.to_datetime('20160603'), 2, 'b']],
+            columns=['time', 'k1', 'k2']).set_index('time')
+
+        right = pd.DataFrame([
+            [pd.to_datetime('20160502'), 1, 'a', 1.0],
+            [pd.to_datetime('20160502'), 2, 'a', 2.0],
+            [pd.to_datetime('20160503'), 1, 'b', 3.0],
+            [pd.to_datetime('20160503'), 2, 'b', 4.0]],
+            columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+        expected = pd.DataFrame([
+            [pd.to_datetime('20160602'), 1, 'a', 1.0],
+            [pd.to_datetime('20160602'), 2, 'a', 2.0],
+            [pd.to_datetime('20160603'), 1, 'b', 3.0],
+            [pd.to_datetime('20160603'), 2, 'b', 4.0]],
+            columns=['time', 'k1', 'k2', 'value']).set_index('time')
+
+        result = pd.merge_asof(left,
+                               right,
+                               left_index=True,
+                               right_index=True,
+                               by=['k1', 'k2'])
+
+        assert_frame_equal(expected, result)
+
+        with self.assertRaises(MergeError):
+            pd.merge_asof(left, right, left_index=True, right_index=True,
+                          left_by=['k1', 'k2'], right_by=['k1'])
+
     def test_basic2(self):
 
         expected = self.read_data('asof2.csv')
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
@@ -1165,14 +1165,17 @@ def _validate_specification(self):
         if self.left_by is not None and self.right_by is None:
             raise MergeError('missing right_by')
 
-        # add by to our key-list so we can have it in the
+        # add 'by' to our key-list so we can have it in the
         # output as a key
         if self.left_by is not None:
             if not is_list_like(self.left_by):
                 self.left_by = [self.left_by]
             if not is_list_like(self.right_by):
                 self.right_by = [self.right_by]
 
+            if len(self.left_by) != len(self.right_by):
+                raise MergeError('left_by and right_by must be same length')
+
             self.left_on = self.left_by + list(self.left_on)
             self.right_on = self.right_by + list(self.right_on)
 
@@ -1264,13 +1267,21 @@ def flip(xs):
 
         # a "by" parameter requires special handling
         if self.left_by is not None:
-            if len(self.left_join_keys) > 2:
-                # get tuple representation of values if more than one
-                left_by_values = flip(self.left_join_keys[0:-1])
-                right_by_values = flip(self.right_join_keys[0:-1])
+            # remove 'on' parameter from values if one existed
+            if self.left_index and self.right_index:
+                left_by_values = self.left_join_keys
+                right_by_values = self.right_join_keys
+            else:
+                left_by_values = self.left_join_keys[0:-1]
+                right_by_values = self.right_join_keys[0:-1]
+
+            # get tuple representation of values if more than one
+            if len(left_by_values) == 1:
+                left_by_values = left_by_values[0]
+                right_by_values = right_by_values[0]
             else:
-                left_by_values = self.left_join_keys[0]
-                right_by_values = self.right_join_keys[0]
+                left_by_values = flip(left_by_values)
+                right_by_values = flip(right_by_values)
 
             # upcast 'by' parameter because HashTable is limited
             by_type = _get_cython_type_upcast(left_by_values.dtype)