diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6a0470b839843..ee1f1b7be1b86 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -495,6 +495,7 @@ Reshaping - Bug in :func:`crosstab` when ``dropna=False`` would not keep ``np.nan`` in the result (:issue:`10772`) - Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`) - Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`) +- Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`) - Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dist-like argument passed in (:issue:`51099`) - Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`) - Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`) @@ -503,6 +504,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`) - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`) - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) +- Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 200e67aee80cc..e68277c38063e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1992,11 +1992,10 @@ def _maybe_require_matching_dtypes( ) -> None: # TODO: why do we do this for AsOfMerge but not the others? - # validate index types are the same - for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): - if lk.dtype != rk.dtype: - if isinstance(lk.dtype, CategoricalDtype) and isinstance( - rk.dtype, CategoricalDtype + def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): + if left.dtype != right.dtype: + if isinstance(left.dtype, CategoricalDtype) and isinstance( + right.dtype, CategoricalDtype ): # The generic error message is confusing for categoricals. # @@ -2007,16 +2006,32 @@ def _maybe_require_matching_dtypes( # later with a ValueError, so we don't *need* to check # for them here. msg = ( - f"incompatible merge keys [{i}] {repr(lk.dtype)} and " - f"{repr(rk.dtype)}, both sides category, but not equal ones" + f"incompatible merge keys [{i}] {repr(left.dtype)} and " + f"{repr(right.dtype)}, both sides category, but not equal ones" ) else: msg = ( - f"incompatible merge keys [{i}] {repr(lk.dtype)} and " - f"{repr(rk.dtype)}, must be the same type" + f"incompatible merge keys [{i}] {repr(left.dtype)} and " + f"{repr(right.dtype)}, must be the same type" ) raise MergeError(msg) + # validate index types are the same + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + _check_dtype_match(lk, rk, i) + + if self.left_index: + lt = self.left.index._values + else: + lt = left_join_keys[-1] + + if self.right_index: + rt = self.right.index._values + else: + rt = right_join_keys[-1] + + _check_dtype_match(lt, rt, 0) + def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: # validate tolerance; datetime.timedelta or Timedelta if we have a DTI if self.tolerance is not None: @@ -2090,6 +2105,10 @@ def injection(obj: ArrayLike): right_values = ( self.right.index._values if self.right_index else self.right_join_keys[-1] ) + + # _maybe_require_matching_dtypes already checked for dtype matching + assert left_values.dtype == right_values.dtype + tolerance = self.tolerance # we require sortedness and non-null values in the join keys diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 0678074943ffb..67591b2adc2f9 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -418,6 +418,34 @@ def test_multiby_heterogeneous_types(self): result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) tm.assert_frame_equal(result, expected) + def test_mismatched_index_dtype(self): + # similar to test_multiby_indexed, but we change the dtype on left.index + left = pd.DataFrame( + [ + [to_datetime("20160602"), 1, "a"], + [to_datetime("20160602"), 2, "a"], + [to_datetime("20160603"), 1, "b"], + [to_datetime("20160603"), 2, "b"], + ], + columns=["time", "k1", "k2"], + ).set_index("time") + # different dtype for the index + left.index = left.index - pd.Timestamp(0) + + right = pd.DataFrame( + [ + [to_datetime("20160502"), 1, "a", 1.0], + [to_datetime("20160502"), 2, "a", 2.0], + [to_datetime("20160503"), 1, "b", 3.0], + [to_datetime("20160503"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + msg = "incompatible merge keys" + with pytest.raises(MergeError, match=msg): + merge_asof(left, right, left_index=True, right_index=True, by=["k1", "k2"]) + def test_multiby_indexed(self): # GH15676 left = pd.DataFrame(