Skip to content

Commit d716feb

Browse files
JustinZhengBC authored and jreback committed
BUG-24212 fix usage of Index.take in pd.merge (#24733)
1 parent 03001be commit d716feb

File tree

3 files changed

+60
-2
lines changed

3 files changed

+60
-2
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1817,6 +1817,7 @@ Reshaping
18171817
- Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`)
18181818
- Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`)
18191819
- Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`)
1820+
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
18201821
- Bug showing an incorrect shape when throwing error during ``DataFrame`` construction. (:issue:`20742`)
18211822

18221823
.. _whatsnew_0240.bug_fixes.sparse:

pandas/core/reshape/merge.py

+39-2
Original file line numberDiff line numberDiff line change
@@ -757,13 +757,19 @@ def _get_join_info(self):
757757

758758
if self.right_index:
759759
if len(self.left) > 0:
760-
join_index = self.left.index.take(left_indexer)
760+
join_index = self._create_join_index(self.left.index,
761+
self.right.index,
762+
left_indexer,
763+
how='right')
761764
else:
762765
join_index = self.right.index.take(right_indexer)
763766
left_indexer = np.array([-1] * len(join_index))
764767
elif self.left_index:
765768
if len(self.right) > 0:
766-
join_index = self.right.index.take(right_indexer)
769+
join_index = self._create_join_index(self.right.index,
770+
self.left.index,
771+
right_indexer,
772+
how='left')
767773
else:
768774
join_index = self.left.index.take(left_indexer)
769775
right_indexer = np.array([-1] * len(join_index))
@@ -774,6 +780,37 @@ def _get_join_info(self):
774780
join_index = join_index.astype(object)
775781
return join_index, left_indexer, right_indexer
776782

783+
def _create_join_index(self, index, other_index, indexer, how='left'):
    """
    Create a join index by rearranging one index to match another.

    ``index`` is reordered with ``indexer``.  Positions where ``indexer``
    holds ``-1`` are then overwritten with the values found at the same
    positions in ``other_index``, but only when ``self.how`` equals
    ``how`` or ``'outer'`` and ``other_index`` is not a ``MultiIndex``.

    Parameters
    ----------
    index : Index
        Index being rearranged.
    other_index : Index
        Index used to supply values not found in ``index``.
    indexer : array of int
        How to rearrange ``index``; ``-1`` marks a missing entry.
    how : str, default 'left'
        Replacement is only necessary if the indexer is based on
        ``other_index``, i.e. when ``self.how`` is ``how`` or ``'outer'``.

    Returns
    -------
    join_index : Index
    """
    join_index = index.take(indexer)

    # Only joins that keep every row of the other side can produce
    # missing (-1) entries that need to be patched.
    if self.how not in (how, 'outer'):
        return join_index
    if isinstance(other_index, MultiIndex):
        return join_index

    # Index.take treats -1 as "last element", so those positions currently
    # hold the final value of ``index`` instead of a real match.
    missing = indexer == -1
    if not np.any(missing):
        return join_index

    # Ask for an explicit copy: the array is modified in place below and
    # must not alias (or be a read-only view of) the Index's own buffer.
    values = join_index.to_numpy(copy=True)
    # NOTE(review): other_index is indexed with a mask built from
    # ``indexer``; this assumes other_index is already aligned row-for-row
    # with the join result -- TODO confirm against the callers.
    values[missing] = other_index.to_numpy()[missing]
    return Index(values, dtype=join_index.dtype, name=join_index.name)
813+
777814
def _get_merge_keys(self):
778815
"""
779816
Note: has side effects (copy/delete key columns)

pandas/tests/reshape/merge/test_merge.py

+20
Original file line numberDiff line numberDiff line change
@@ -939,6 +939,26 @@ def test_merge_two_empty_df_no_division_error(self):
939939
with np.errstate(divide='raise'):
940940
merge(a, a, on=('a', 'b'))
941941

942+
@pytest.mark.parametrize('how', ['left', 'outer'])
def test_merge_on_index_with_more_values(self, how):
    # GH 24212
    # The left frame has index values (1, 2) that are absent from the
    # right frame, so pd.merge gets [-1, -1, 0, 1] as right_indexer.
    # Those -1 entries must be read as "missing", not as "last element".
    left = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]],
                        columns=['a', 'b'])
    left = left.set_index('a', drop=False)

    right = pd.DataFrame([[3, 30], [4, 40]], columns=['a', 'c'])
    right = right.set_index('a')

    result = pd.merge(left, right,
                      left_index=True, right_on='a', how=how)

    # Rows without a match on the right get NaN in column 'c', and the
    # resulting index keeps the left frame's 'a' values.
    expected = pd.DataFrame([[1, 2, np.nan],
                             [2, 4, np.nan],
                             [3, 6, 30.0],
                             [4, 8, 40.0]],
                            columns=['a', 'b', 'c'])
    expected = expected.set_index('a', drop=False)

    assert_frame_equal(result, expected)
961+
942962

943963
def _check_merge(x, y):
944964
for how in ['inner', 'left', 'outer']:

0 commit comments

Comments
 (0)