Skip to content

Commit 9824c57

Browse files
authored
BUG: merge with mismatched index dtypes failing to raise (#53870)
* BUG: merge with mismatched index dtypes failing to raise * gh ref
1 parent 49dc7b9 commit 9824c57

File tree

3 files changed

+58
-9
lines changed

3 files changed

+58
-9
lines changed

doc/source/whatsnew/v2.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ Reshaping
495495
- Bug in :func:`crosstab` when ``dropna=False`` would not keep ``np.nan`` in the result (:issue:`10772`)
496496
- Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`)
497497
- Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`)
498+
- Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`)
498499
- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when a dict-like argument is passed in (:issue:`51099`)
499500
- Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`)
500501
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
@@ -503,6 +504,7 @@ Reshaping
503504
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
504505
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
505506
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
507+
-
506508

507509
Sparse
508510
^^^^^^

pandas/core/reshape/merge.py

+28-9
Original file line numberDiff line numberDiff line change
@@ -1992,11 +1992,10 @@ def _maybe_require_matching_dtypes(
19921992
) -> None:
19931993
# TODO: why do we do this for AsOfMerge but not the others?
19941994

1995-
# validate index types are the same
1996-
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
1997-
if lk.dtype != rk.dtype:
1998-
if isinstance(lk.dtype, CategoricalDtype) and isinstance(
1999-
rk.dtype, CategoricalDtype
1995+
def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int):
1996+
if left.dtype != right.dtype:
1997+
if isinstance(left.dtype, CategoricalDtype) and isinstance(
1998+
right.dtype, CategoricalDtype
20001999
):
20012000
# The generic error message is confusing for categoricals.
20022001
#
@@ -2007,16 +2006,32 @@ def _maybe_require_matching_dtypes(
20072006
# later with a ValueError, so we don't *need* to check
20082007
# for them here.
20092008
msg = (
2010-
f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2011-
f"{repr(rk.dtype)}, both sides category, but not equal ones"
2009+
f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2010+
f"{repr(right.dtype)}, both sides category, but not equal ones"
20122011
)
20132012
else:
20142013
msg = (
2015-
f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2016-
f"{repr(rk.dtype)}, must be the same type"
2014+
f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2015+
f"{repr(right.dtype)}, must be the same type"
20172016
)
20182017
raise MergeError(msg)
20192018

2019+
# validate index types are the same
2020+
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
2021+
_check_dtype_match(lk, rk, i)
2022+
2023+
if self.left_index:
2024+
lt = self.left.index._values
2025+
else:
2026+
lt = left_join_keys[-1]
2027+
2028+
if self.right_index:
2029+
rt = self.right.index._values
2030+
else:
2031+
rt = right_join_keys[-1]
2032+
2033+
_check_dtype_match(lt, rt, 0)
2034+
20202035
def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
20212036
# validate tolerance; datetime.timedelta or Timedelta if we have a DTI
20222037
if self.tolerance is not None:
@@ -2090,6 +2105,10 @@ def injection(obj: ArrayLike):
20902105
right_values = (
20912106
self.right.index._values if self.right_index else self.right_join_keys[-1]
20922107
)
2108+
2109+
# _maybe_require_matching_dtypes already checked for dtype matching
2110+
assert left_values.dtype == right_values.dtype
2111+
20932112
tolerance = self.tolerance
20942113

20952114
# we require sortedness and non-null values in the join keys

pandas/tests/reshape/merge/test_merge_asof.py

+28
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,34 @@ def test_multiby_heterogeneous_types(self):
418418
result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
419419
tm.assert_frame_equal(result, expected)
420420

421+
def test_mismatched_index_dtype(self):
422+
# similar to test_multiby_indexed, but we change the dtype on left.index
423+
left = pd.DataFrame(
424+
[
425+
[to_datetime("20160602"), 1, "a"],
426+
[to_datetime("20160602"), 2, "a"],
427+
[to_datetime("20160603"), 1, "b"],
428+
[to_datetime("20160603"), 2, "b"],
429+
],
430+
columns=["time", "k1", "k2"],
431+
).set_index("time")
432+
# different dtype for the index
433+
left.index = left.index - pd.Timestamp(0)
434+
435+
right = pd.DataFrame(
436+
[
437+
[to_datetime("20160502"), 1, "a", 1.0],
438+
[to_datetime("20160502"), 2, "a", 2.0],
439+
[to_datetime("20160503"), 1, "b", 3.0],
440+
[to_datetime("20160503"), 2, "b", 4.0],
441+
],
442+
columns=["time", "k1", "k2", "value"],
443+
).set_index("time")
444+
445+
msg = "incompatible merge keys"
446+
with pytest.raises(MergeError, match=msg):
447+
merge_asof(left, right, left_index=True, right_index=True, by=["k1", "k2"])
448+
421449
def test_multiby_indexed(self):
422450
# GH15676
423451
left = pd.DataFrame(

0 commit comments

Comments
 (0)