Skip to content

Commit c2c7939

Browse files
chrish42 and jreback
authored and committed
ENH: clearer error msg for unequal categoricals in merge_asof (GH#26136) (#26242)
1 parent 4df308f commit c2c7939

File tree

3 files changed

+36
-8
lines changed

3 files changed

+36
-8
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ Other Enhancements
3838
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
3939
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
4040
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`)
41+
- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`)
4142

4243
.. _whatsnew_0250.api_breaking:
4344

pandas/core/reshape/merge.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -1446,10 +1446,26 @@ def _get_merge_keys(self):
14461446
# validate index types are the same
14471447
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
14481448
if not is_dtype_equal(lk.dtype, rk.dtype):
1449-
raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
1450-
"{rkdtype}, must be the same type"
1451-
.format(i=i, lkdtype=lk.dtype,
1452-
rkdtype=rk.dtype))
1449+
if (is_categorical_dtype(lk.dtype) and
1450+
is_categorical_dtype(rk.dtype)):
1451+
# The generic error message is confusing for categoricals.
1452+
#
1453+
# In this function, the join keys include both the original
1454+
# ones of the merge_asof() call, and also the keys passed
1455+
# to its by= argument. Unordered but equal categories
1456+
# are not supported for the former, but will fail
1457+
# later with a ValueError, so we don't *need* to check
1458+
# for them here.
1459+
msg = ("incompatible merge keys [{i}] {lkdtype} and "
1460+
"{rkdtype}, both sides category, but not equal ones"
1461+
.format(i=i, lkdtype=repr(lk.dtype),
1462+
rkdtype=repr(rk.dtype)))
1463+
else:
1464+
msg = ("incompatible merge keys [{i}] {lkdtype} and "
1465+
"{rkdtype}, must be the same type"
1466+
.format(i=i, lkdtype=repr(lk.dtype),
1467+
rkdtype=repr(rk.dtype)))
1468+
raise MergeError(msg)
14531469

14541470
# validate tolerance; must be a Timedelta if we have a DTI
14551471
if self.tolerance is not None:
@@ -1462,7 +1478,7 @@ def _get_merge_keys(self):
14621478
msg = ("incompatible tolerance {tolerance}, must be compat "
14631479
"with type {lkdtype}".format(
14641480
tolerance=type(self.tolerance),
1465-
lkdtype=lt.dtype))
1481+
lkdtype=repr(lt.dtype)))
14661482

14671483
if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
14681484
if not isinstance(self.tolerance, Timedelta):

pandas/tests/reshape/merge/test_merge_asof.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -994,9 +994,8 @@ def test_on_float_by_int(self):
994994

995995
assert_frame_equal(result, expected)
996996

997-
def test_merge_datatype_error(self):
998-
""" Tests merge datatype mismatch error """
999-
msg = r'merge keys \[0\] object and int64, must be the same type'
997+
def test_merge_datatype_error_raises(self):
998+
msg = r'incompatible merge keys \[0\] .*, must be the same type'
1000999

10011000
left = pd.DataFrame({'left_val': [1, 5, 10],
10021001
'a': ['a', 'b', 'c']})
@@ -1006,6 +1005,18 @@ def test_merge_datatype_error(self):
10061005
with pytest.raises(MergeError, match=msg):
10071006
merge_asof(left, right, on='a')
10081007

1008+
def test_merge_datatype_categorical_error_raises(self):
1009+
msg = (r'incompatible merge keys \[0\] .* both sides category, '
1010+
'but not equal ones')
1011+
1012+
left = pd.DataFrame({'left_val': [1, 5, 10],
1013+
'a': pd.Categorical(['a', 'b', 'c'])})
1014+
right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7],
1015+
'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])})
1016+
1017+
with pytest.raises(MergeError, match=msg):
1018+
merge_asof(left, right, on='a')
1019+
10091020
@pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)],
10101021
ids=['numeric', 'datetime'])
10111022
@pytest.mark.parametrize('side', ['left', 'right'])

0 commit comments

Comments
 (0)