diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b2a379d9fe6f5..9d7868ed1394e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,6 +37,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) +- :func:`merge_asof` now gives a clearer error message when merge keys are categoricals that are not equal (:issue:`26136`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0837186e33267..2bebb66e10e64 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1446,10 +1446,26 @@ def _get_merge_keys(self): # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type" - .format(i=i, lkdtype=lk.dtype, - rkdtype=rk.dtype)) + if (is_categorical_dtype(lk.dtype) and + is_categorical_dtype(rk.dtype)): + # The generic error message is confusing for categoricals. + # + # In this function, the join keys include both the original + # ones of the merge_asof() call, and also the keys passed + # to its by= argument. Unordered but equal categories + # are not supported for the former, but will fail + # later with a ValueError, so we don't *need* to check + # for them here. 
+ msg = ("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, both sides category, but not equal ones" + .format(i=i, lkdtype=repr(lk.dtype), + rkdtype=repr(rk.dtype))) + else: + msg = ("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type" + .format(i=i, lkdtype=repr(lk.dtype), + rkdtype=repr(rk.dtype))) + raise MergeError(msg) # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: @@ -1462,7 +1478,7 @@ def _get_merge_keys(self): msg = ("incompatible tolerance {tolerance}, must be compat " "with type {lkdtype}".format( tolerance=type(self.tolerance), - lkdtype=lt.dtype)) + lkdtype=repr(lt.dtype))) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 990892f3ccda3..684fba5867c00 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -994,9 +994,8 @@ def test_on_float_by_int(self): assert_frame_equal(result, expected) - def test_merge_datatype_error(self): - """ Tests merge datatype mismatch error """ - msg = r'merge keys \[0\] object and int64, must be the same type' + def test_merge_datatype_error_raises(self): + msg = r'incompatible merge keys \[0\] .*, must be the same type' left = pd.DataFrame({'left_val': [1, 5, 10], 'a': ['a', 'b', 'c']}) @@ -1006,6 +1005,18 @@ def test_merge_datatype_error(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') + def test_merge_datatype_categorical_error_raises(self): + msg = (r'incompatible merge keys \[0\] .* both sides category, ' + 'but not equal ones') + + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': pd.Categorical(['a', 'b', 'c'])}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) + + with pytest.raises(MergeError, match=msg): + 
merge_asof(left, right, on='a') + @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)], ids=['numeric', 'datetime']) @pytest.mark.parametrize('side', ['left', 'right'])