Skip to content

Commit 84056c5

Browse files
meeseeksmachine, jreback
authored and committed
Backport PR pandas-dev#24916: BUG-24212 fix regression in pandas-dev#24897 (pandas-dev#24951)
1 parent 31dcbb7 commit 84056c5

File tree

3 files changed

+60
-19
lines changed

3 files changed

+60
-19
lines changed

doc/source/whatsnew/v0.24.1.rst

+3
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,9 @@ Bug Fixes
6363
-
6464
-
6565

66+
**Reshaping**
67+
68+
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
6669

6770
**Other**
6871

pandas/core/reshape/merge.py

+43-2
Original file line number | Diff line number | Diff line change
@@ -757,13 +757,21 @@ def _get_join_info(self):
757757

758758
if self.right_index:
759759
if len(self.left) > 0:
760-
join_index = self.left.index.take(left_indexer)
760+
join_index = self._create_join_index(self.left.index,
761+
self.right.index,
762+
left_indexer,
763+
right_indexer,
764+
how='right')
761765
else:
762766
join_index = self.right.index.take(right_indexer)
763767
left_indexer = np.array([-1] * len(join_index))
764768
elif self.left_index:
765769
if len(self.right) > 0:
766-
join_index = self.right.index.take(right_indexer)
770+
join_index = self._create_join_index(self.right.index,
771+
self.left.index,
772+
right_indexer,
773+
left_indexer,
774+
how='left')
767775
else:
768776
join_index = self.left.index.take(left_indexer)
769777
right_indexer = np.array([-1] * len(join_index))
@@ -774,6 +782,39 @@ def _get_join_info(self):
774782
join_index = join_index.astype(object)
775783
return join_index, left_indexer, right_indexer
776784

785+
def _create_join_index(self, index, other_index, indexer,
786+
other_indexer, how='left'):
787+
"""
788+
Create a join index by rearranging one index to match another.
789+
790+
Parameters
791+
----------
792+
index: Index being rearranged (its values are taken with indexer)
793+
other_index: Index used to supply values not found in index
794+
indexer: how to rearrange index; entries equal to -1 mark rows that
have no match in index
other_indexer: how to rearrange other_index; the values taken this way
replace the positions where indexer is -1
795+
how: replacement is only necessary if indexer based on other_index;
it is applied only when self.how equals this value or 'outer', and
only when other_index is not a MultiIndex
796+
797+
Returns
798+
-------
799+
join_index
800+
"""
801+
join_index = index.take(indexer)
802+
if (self.how in (how, 'outer') and
803+
not isinstance(other_index, MultiIndex)):
804+
# if the final index requires values in other_index but not target
805+
# index, indexer may hold missing (-1) values, causing Index.take
806+
# to take the final value in target index
807+
mask = indexer == -1
808+
if np.any(mask):
809+
# if values are missing (-1) from target index,
810+
# take them from other_index instead
811+
join_list = join_index.to_numpy()
812+
other_list = other_index.take(other_indexer).to_numpy()
813+
join_list[mask] = other_list[mask]
814+
# rebuild the Index, keeping the dtype and name of the taken index
join_index = Index(join_list, dtype=join_index.dtype,
815+
name=join_index.name)
816+
return join_index
817+
777818
def _get_merge_keys(self):
778819
"""
779820
Note: has side effects (copy/delete key columns)

pandas/tests/reshape/merge/test_merge.py

+14-17
Original file line number | Diff line number | Diff line change
@@ -939,25 +939,22 @@ def test_merge_two_empty_df_no_division_error(self):
939939
with np.errstate(divide='raise'):
940940
merge(a, a, on=('a', 'b'))
941941

942-
@pytest.mark.parametrize('how', ['left', 'outer'])
943-
@pytest.mark.xfail(reason="GH-24897")
942+
@pytest.mark.parametrize('how', ['right', 'outer'])
944943
def test_merge_on_index_with_more_values(self, how):
945944
# GH 24212
946-
# pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is
947-
# interpreted as a missing value instead of the last element
948-
df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
949-
columns=['a', 'b'])
950-
df2 = pd.DataFrame([[3, 30], [4, 40]],
951-
columns=['a', 'c'])
952-
df1.set_index('a', drop=False, inplace=True)
953-
df2.set_index('a', inplace=True)
954-
result = pd.merge(df1, df2, left_index=True, right_on='a', how=how)
955-
expected = pd.DataFrame([[1, 2, np.nan],
956-
[2, 4, np.nan],
957-
[3, 6, 30.0],
958-
[4, 8, 40.0]],
959-
columns=['a', 'b', 'c'])
960-
expected.set_index('a', drop=False, inplace=True)
945+
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
946+
# -1 is interpreted as a missing value instead of the last element
947+
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]})
948+
df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]})
949+
result = df1.merge(df2, left_on='key', right_index=True, how=how)
950+
expected = pd.DataFrame([[1.0, 0, 1],
951+
[2.0, 2, 3],
952+
[3.0, 2, 3],
953+
[np.nan, 1, 2],
954+
[np.nan, 3, 4],
955+
[np.nan, 4, 5]],
956+
columns=['a', 'key', 'b'])
957+
expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True)
961958
assert_frame_equal(result, expected)
962959

963960
def test_merge_right_index_right(self):

0 commit comments

Comments
 (0)