diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 7e69a8044a305..2e39ef5f0ce76 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -660,6 +660,8 @@ Bug Fixes - Bug in ``io.common.get_filepath_or_buffer`` which caused reading of valid S3 files to fail if the bucket also contained keys for which the user does not have read permission (:issue:`10604`) - Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`) - Bug in ``Index.take`` may add unnecessary ``freq`` attribute (:issue:`10791`) +- Bug in ``merge`` with empty ``DataFrame`` may raise ``IndexError`` (:issue:`10824`) + - Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if file contains only a header line (:issue:`9535`) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 5ee774635e59e..a8b0d37b55bfe 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -217,6 +217,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_indexer is not None and right_indexer is not None: if name in self.left: + if len(self.left) == 0: + continue + na_indexer = (left_indexer == -1).nonzero()[0] if len(na_indexer) == 0: continue @@ -226,6 +229,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): na_indexer, com.take_1d(self.right_join_keys[i], right_na_indexer)) elif name in self.right: + if len(self.right) == 0: + continue + na_indexer = (right_indexer == -1).nonzero()[0] if len(na_indexer) == 0: continue @@ -270,9 +276,17 @@ def _get_join_info(self): sort=self.sort, how=self.how) if self.right_index: - join_index = self.left.index.take(left_indexer) + if len(self.left) > 0: + join_index = self.left.index.take(left_indexer) + else: + join_index = self.right.index.take(right_indexer) + left_indexer = np.array([-1] * len(join_index)) elif self.left_index: - join_index = self.right.index.take(right_indexer) + if len(self.right) > 0: + join_index = self.right.index.take(right_indexer) + else: + join_index = self.left.index.take(left_indexer) + right_indexer = np.array([-1] * len(join_index)) else: join_index = Index(np.arange(len(left_indexer))) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 236157d028db3..ee83b9632bd4b 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -737,6 +737,88 @@ def test_left_merge_empty_dataframe(self): result = merge(right, left, on='key', how='right') assert_frame_equal(result, left) + def test_merge_left_empty_right_empty(self): + # GH 10824 + left = pd.DataFrame([], columns=['a', 'b', 'c']) + right = pd.DataFrame([], columns=['x', 'y', 'z']) + + exp_in = pd.DataFrame([], columns=['a', 'b', 'c', 'x', 'y', 'z'], + dtype=object) + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x'), + dict(left_on='a', right_index=True), + dict(left_on='a', right_on='x')]: + + result = pd.merge(left, right, how='inner', **kwarg) + tm.assert_frame_equal(result, exp_in) + result = pd.merge(left, right, how='left', **kwarg) + tm.assert_frame_equal(result, exp_in) + result = pd.merge(left, right, how='right', **kwarg) + tm.assert_frame_equal(result, exp_in) + result = pd.merge(left, right, how='outer', **kwarg) + tm.assert_frame_equal(result, exp_in) + + def test_merge_left_empty_right_notempty(self): + # GH 10824 + left = pd.DataFrame([], columns=['a', 'b', 'c']) + right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['x', 'y', 'z']) + + exp_out = pd.DataFrame({'a': np.array([np.nan]*3, dtype=object), + 'b': np.array([np.nan]*3, dtype=object), + 'c': np.array([np.nan]*3, dtype=object), + 'x': [1, 4, 7], + 'y': [2, 5, 8], + 'z': [3, 6, 9]}, + columns=['a', 'b', 'c', 'x', 'y', 'z']) + exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x'), + dict(left_on='a', right_index=True), + dict(left_on='a', right_on='x')]: + + result = pd.merge(left, right, how='inner', **kwarg) + tm.assert_frame_equal(result, exp_in) + result = pd.merge(left, right, how='left', **kwarg) + tm.assert_frame_equal(result, exp_in) + + result = pd.merge(left, right, how='right', **kwarg) + tm.assert_frame_equal(result, exp_out) + result = pd.merge(left, right, how='outer', **kwarg) + tm.assert_frame_equal(result, exp_out) + + def test_merge_left_notempty_right_empty(self): + # GH 10824 + left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=['a', 'b', 'c']) + right = pd.DataFrame([], columns=['x', 'y', 'z']) + + exp_out = pd.DataFrame({'a': [1, 4, 7], + 'b': [2, 5, 8], + 'c': [3, 6, 9], + 'x': np.array([np.nan]*3, dtype=object), + 'y': np.array([np.nan]*3, dtype=object), + 'z': np.array([np.nan]*3, dtype=object)}, + columns=['a', 'b', 'c', 'x', 'y', 'z']) + exp_in = exp_out[0:0] # make empty DataFrame keeping dtype + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x'), + dict(left_on='a', right_index=True), + dict(left_on='a', right_on='x')]: + + result = pd.merge(left, right, how='inner', **kwarg) + tm.assert_frame_equal(result, exp_in) + result = pd.merge(left, right, how='right', **kwarg) + tm.assert_frame_equal(result, exp_in) + + result = pd.merge(left, right, how='left', **kwarg) + tm.assert_frame_equal(result, exp_out) + result = pd.merge(left, right, how='outer', **kwarg) + tm.assert_frame_equal(result, exp_out) + def test_merge_nosort(self): # #2098, anything to do?