Skip to content

BUG: merge with mismatched index dtypes failing to raise #53870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ Reshaping
- Bug in :func:`crosstab` when ``dropna=False`` would not keep ``np.nan`` in the result (:issue:`10772`)
- Bug in :func:`merge_asof` raising ``KeyError`` for extension dtypes (:issue:`52904`)
- Bug in :func:`merge_asof` raising ``ValueError`` for data backed by read-only ndarrays (:issue:`53513`)
- Bug in :func:`merge_asof` with ``left_index=True`` or ``right_index=True`` with mismatched index dtypes giving incorrect results in some cases instead of raising ``MergeError`` (:issue:`53870`)
- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns would return incorrect type when dict-like argument is passed in (:issue:`51099`)
- Bug in :meth:`DataFrame.combine_first` ignoring other's columns if ``other`` is empty (:issue:`53792`)
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
Expand All @@ -503,6 +504,7 @@ Reshaping
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
-

Sparse
^^^^^^
Expand Down
37 changes: 28 additions & 9 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1992,11 +1992,10 @@ def _maybe_require_matching_dtypes(
) -> None:
# TODO: why do we do this for AsOfMerge but not the others?

# validate index types are the same
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
if lk.dtype != rk.dtype:
if isinstance(lk.dtype, CategoricalDtype) and isinstance(
rk.dtype, CategoricalDtype
def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int):
if left.dtype != right.dtype:
if isinstance(left.dtype, CategoricalDtype) and isinstance(
right.dtype, CategoricalDtype
):
# The generic error message is confusing for categoricals.
#
Expand All @@ -2007,16 +2006,32 @@ def _maybe_require_matching_dtypes(
# later with a ValueError, so we don't *need* to check
# for them here.
msg = (
f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
f"{repr(rk.dtype)}, both sides category, but not equal ones"
f"incompatible merge keys [{i}] {repr(left.dtype)} and "
f"{repr(right.dtype)}, both sides category, but not equal ones"
)
else:
msg = (
f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
f"{repr(rk.dtype)}, must be the same type"
f"incompatible merge keys [{i}] {repr(left.dtype)} and "
f"{repr(right.dtype)}, must be the same type"
)
raise MergeError(msg)

# validate index types are the same
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
_check_dtype_match(lk, rk, i)

if self.left_index:
lt = self.left.index._values
else:
lt = left_join_keys[-1]

if self.right_index:
rt = self.right.index._values
else:
rt = right_join_keys[-1]

_check_dtype_match(lt, rt, 0)

def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
# validate tolerance; datetime.timedelta or Timedelta if we have a DTI
if self.tolerance is not None:
Expand Down Expand Up @@ -2090,6 +2105,10 @@ def injection(obj: ArrayLike):
right_values = (
self.right.index._values if self.right_index else self.right_join_keys[-1]
)

# _maybe_require_matching_dtypes already checked for dtype matching
assert left_values.dtype == right_values.dtype

tolerance = self.tolerance

# we require sortedness and non-null values in the join keys
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/reshape/merge/test_merge_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,34 @@ def test_multiby_heterogeneous_types(self):
result = merge_asof(trades, quotes, on="time", by=["ticker", "exch"])
tm.assert_frame_equal(result, expected)

def test_mismatched_index_dtype(self):
    """merge_asof on both indexes raises MergeError when index dtypes differ.

    The frames are similar to the ones in test_multiby_indexed, except
    that the dtype of ``left.index`` is changed before merging.
    """
    left_rows = [
        [to_datetime("20160602"), 1, "a"],
        [to_datetime("20160602"), 2, "a"],
        [to_datetime("20160603"), 1, "b"],
        [to_datetime("20160603"), 2, "b"],
    ]
    left = pd.DataFrame(left_rows, columns=["time", "k1", "k2"])
    left = left.set_index("time")
    # give left.index a different dtype than right.index
    left.index = left.index - pd.Timestamp(0)

    right_rows = [
        [to_datetime("20160502"), 1, "a", 1.0],
        [to_datetime("20160502"), 2, "a", 2.0],
        [to_datetime("20160503"), 1, "b", 3.0],
        [to_datetime("20160503"), 2, "b", 4.0],
    ]
    right = pd.DataFrame(right_rows, columns=["time", "k1", "k2", "value"])
    right = right.set_index("time")

    # the mismatch must be reported up front rather than merged through
    with pytest.raises(MergeError, match="incompatible merge keys"):
        merge_asof(
            left,
            right,
            left_index=True,
            right_index=True,
            by=["k1", "k2"],
        )

def test_multiby_indexed(self):
# GH15676
left = pd.DataFrame(
Expand Down