Skip to content

Commit cc3b2f0

Browse files
JustinZhengBC authored and jreback committed
BUG-24212 fix when other_index has incompatible dtype (#25009)
1 parent ec2846a commit cc3b2f0

File tree

3 files changed

+48
-16
lines changed

3 files changed

+48
-16
lines changed

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ Reshaping
404404
^^^^^^^^^
405405

406406
- Bug in :func:`pandas.merge` adds a string of ``None``, if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`).
407-
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
407+
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (missing index values are now assigned NA) (:issue:`24212`, :issue:`25009`)
408408
- :func:`to_records` now accepts dtypes to its ``column_dtypes`` parameter (:issue:`24895`)
409409
- Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`)
410410
- Bug in :func:`pivot_table` where columns with ``NaN`` values are dropped even if ``dropna`` argument is ``False``, when the ``aggfunc`` argument contains a ``list`` (:issue:`22159`)

pandas/core/reshape/merge.py

+6-10
Original file line numberDiff line numberDiff line change
@@ -803,22 +803,18 @@ def _create_join_index(self, index, other_index, indexer,
803803
-------
804804
join_index
805805
"""
806-
join_index = index.take(indexer)
807806
if (self.how in (how, 'outer') and
808807
not isinstance(other_index, MultiIndex)):
809808
# if final index requires values in other_index but not target
810809
# index, indexer may hold missing (-1) values, causing Index.take
811-
# to take the final value in target index
810+
# to take the final value in target index. So, we set the last
811+
# element to be the desired fill value. We do not use allow_fill
812+
# and fill_value because it throws a ValueError on integer indices
812813
mask = indexer == -1
813814
if np.any(mask):
814-
# if values missing (-1) from target index,
815-
# take from other_index instead
816-
join_list = join_index.to_numpy()
817-
other_list = other_index.take(other_indexer).to_numpy()
818-
join_list[mask] = other_list[mask]
819-
join_index = Index(join_list, dtype=join_index.dtype,
820-
name=join_index.name)
821-
return join_index
815+
fill_value = na_value_for_dtype(index.dtype, compat=False)
816+
index = index.append(Index([fill_value]))
817+
return index.take(indexer)
822818

823819
def _get_merge_keys(self):
824820
"""

pandas/tests/reshape/merge/test_merge.py

+41-5
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
import pandas as pd
1616
from pandas import (
1717
Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
18-
Int64Index, MultiIndex, RangeIndex, Series, UInt64Index)
18+
Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series,
19+
TimedeltaIndex, UInt64Index)
1920
from pandas.api.types import CategoricalDtype as CDT
2021
from pandas.core.reshape.concat import concat
2122
from pandas.core.reshape.merge import MergeError, merge
@@ -1034,11 +1035,30 @@ def test_merge_two_empty_df_no_division_error(self):
10341035
merge(a, a, on=('a', 'b'))
10351036

10361037
@pytest.mark.parametrize('how', ['right', 'outer'])
1037-
def test_merge_on_index_with_more_values(self, how):
1038+
@pytest.mark.parametrize(
1039+
'index,expected_index',
1040+
[(CategoricalIndex([1, 2, 4]),
1041+
CategoricalIndex([1, 2, 4, None, None, None])),
1042+
(DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']),
1043+
DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03',
1044+
pd.NaT, pd.NaT, pd.NaT])),
1045+
(Float64Index([1, 2, 3]),
1046+
Float64Index([1, 2, 3, None, None, None])),
1047+
(Int64Index([1, 2, 3]),
1048+
Float64Index([1, 2, 3, None, None, None])),
1049+
(IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
1050+
IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4),
1051+
np.nan, np.nan, np.nan])),
1052+
(PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'),
1053+
PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03',
1054+
pd.NaT, pd.NaT, pd.NaT], freq='D')),
1055+
(TimedeltaIndex(['1d', '2d', '3d']),
1056+
TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))])
1057+
def test_merge_on_index_with_more_values(self, how, index, expected_index):
10381058
# GH 24212
10391059
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
10401060
# -1 is interpreted as a missing value instead of the last element
1041-
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]})
1061+
df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index)
10421062
df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]})
10431063
result = df1.merge(df2, left_on='key', right_index=True, how=how)
10441064
expected = pd.DataFrame([[1.0, 0, 1],
@@ -1048,7 +1068,7 @@ def test_merge_on_index_with_more_values(self, how):
10481068
[np.nan, 3, 4],
10491069
[np.nan, 4, 5]],
10501070
columns=['a', 'key', 'b'])
1051-
expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True)
1071+
expected.set_index(expected_index, inplace=True)
10521072
assert_frame_equal(result, expected)
10531073

10541074
def test_merge_right_index_right(self):
@@ -1062,11 +1082,27 @@ def test_merge_right_index_right(self):
10621082
'key': [0, 1, 1, 2],
10631083
'b': [1, 2, 2, 3]},
10641084
columns=['a', 'key', 'b'],
1065-
index=[0, 1, 2, 2])
1085+
index=[0, 1, 2, np.nan])
10661086
result = left.merge(right, left_on='key', right_index=True,
10671087
how='right')
10681088
tm.assert_frame_equal(result, expected)
10691089

1090+
def test_merge_take_missing_values_from_index_of_other_dtype(self):
1091+
# GH 24212
1092+
left = pd.DataFrame({'a': [1, 2, 3],
1093+
'key': pd.Categorical(['a', 'a', 'b'],
1094+
categories=list('abc'))})
1095+
right = pd.DataFrame({'b': [1, 2, 3]},
1096+
index=pd.CategoricalIndex(['a', 'b', 'c']))
1097+
result = left.merge(right, left_on='key',
1098+
right_index=True, how='right')
1099+
expected = pd.DataFrame({'a': [1, 2, 3, None],
1100+
'key': pd.Categorical(['a', 'a', 'b', 'c']),
1101+
'b': [1, 1, 2, 3]},
1102+
index=[0, 1, 2, np.nan])
1103+
expected = expected.reindex(columns=['a', 'key', 'b'])
1104+
tm.assert_frame_equal(result, expected)
1105+
10701106

10711107
def _check_merge(x, y):
10721108
for how in ['inner', 'left', 'outer']:

0 commit comments

Comments
 (0)