From f89c29c1c77ecbbbea92c6ae9ab9967fd9376726 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Mon, 29 Apr 2019 15:40:05 -0400 Subject: [PATCH 1/7] ENH: clearer error msg for unequal categoricals in merge_asof (GH#26136) --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/reshape/merge.py | 16 ++++++++++++---- pandas/tests/reshape/merge/test_merge_asof.py | 12 ++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b2a379d9fe6f5..a2d78ed5db913 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,6 +37,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) +- :func:`merge_asof` now gives a clearer error message when merge keys are categoricals that are not equal (:issue:`26136`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0837186e33267..bb13b959ebfd2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1446,10 +1446,18 @@ def _get_merge_keys(self): # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type" - .format(i=i, lkdtype=lk.dtype, - rkdtype=rk.dtype)) + if (is_categorical_dtype(lk.dtype) and + is_categorical_dtype(rk.dtype)): + # The generic error message is confusing for categoricals. + msg = ("incompatible merge keys [{i}] both sides " + "category, but not equal ones" + .format(i=i)) + else: + msg = ("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type" + .format(i=i, lkdtype=lk.dtype, + rkdtype=rk.dtype)) + raise MergeError(msg) # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 990892f3ccda3..7eb5f797751eb 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1006,6 +1006,18 @@ def test_merge_datatype_error(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') + def test_merge_datatype_categorical_error(self): + """ Tests merge datatype mismatch error """ + msg = r'merge keys \[0\] both sides category, but not equal ones' + + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': pd.Categorical(['a', 'b', 'c'])}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) + + with pytest.raises(MergeError, match=msg): + merge_asof(left, right, on='a') + @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)], ids=['numeric', 'datetime']) @pytest.mark.parametrize('side', ['left', 'right']) From 93baa93c1f560f071e025e1a8c91b6b99e5dc806 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Mon, 29 Apr 2019 17:01:02 -0400 Subject: [PATCH 2/7] Feedback from code review --- pandas/tests/reshape/merge/test_merge_asof.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 7eb5f797751eb..f5f8e38c7d894 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -994,8 +994,7 @@ def test_on_float_by_int(self): assert_frame_equal(result, expected) - def test_merge_datatype_error(self): - """ Tests merge datatype mismatch error """ + def test_merge_datatype_error_raises(self): msg = r'merge keys \[0\] object and int64, must be the same type' left = pd.DataFrame({'left_val': [1, 5, 10], @@ -1006,8 +1005,7 @@ def test_merge_datatype_error(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') - def test_merge_datatype_categorical_error(self): - """ Tests merge datatype mismatch error """ + def test_merge_datatype_categorical_error_raises(self): msg = r'merge keys \[0\] both sides category, but not equal ones' left = pd.DataFrame({'left_val': [1, 5, 10], From 919f42b1b0f98a1a1d7498958f236d984015c74d Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Tue, 30 Apr 2019 09:59:10 -0400 Subject: [PATCH 3/7] Tweak changelog message --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a2d78ed5db913..9d7868ed1394e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -37,7 +37,7 @@ Other Enhancements - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) - :meth:`DataFrame.query` and :meth:`DataFrame.eval` now supports quoting column names with backticks to refer to names with spaces (:issue:`6508`) -- :func:`merge_asof` now gives a clearer error message when merge keys are categoricals that are not equal (:issue:`26136`) +- :func:`merge_asof` now gives a more clear error message when merge keys are categoricals that are not equal (:issue:`26136`) .. _whatsnew_0250.api_breaking: From 71dcb80554d0870bb306f93416d8d1bbb17b66f1 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Tue, 30 Apr 2019 11:15:48 -0400 Subject: [PATCH 4/7] Work in progress for new test for unordered categoricals --- pandas/core/reshape/merge.py | 10 +++++++++- pandas/tests/reshape/merge/test_merge_asof.py | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bb13b959ebfd2..08eae9248ddb6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1443,8 +1443,16 @@ def _get_merge_keys(self): right_join_keys, join_names) = super()._get_merge_keys() - # validate index types are the same + # validate index types are the same & are not unordered categoricals for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + # TODO Wrong place to put this. By the time we get here, the "by" + # keys (which *can* be unordered) have been added to the join keys. + if any(is_categorical_dtype(dtype) and not dtype.ordered for + dtype in [lk, rk]): + raise MergeError("incompatible merge keys [{i}] unordered " + "category, must be ordered".format(i=i)) + + if not is_dtype_equal(lk.dtype, rk.dtype): if (is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype)): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index f5f8e38c7d894..76dc97975a1a9 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1008,10 +1008,23 @@ def test_merge_datatype_error_raises(self): def test_merge_datatype_categorical_error_raises(self): msg = r'merge keys \[0\] both sides category, but not equal ones' + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': pd.Categorical(['a', 'b', 'c'], + ordered=True)}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'], + ordered=True)}) + + with pytest.raises(MergeError, match=msg): + merge_asof(left, right, on='a') + + def test_merge_datatype_unordered_categorical_raises(self): + msg = r'merge keys \[0\] unordered category, must be ordered' + left = pd.DataFrame({'left_val': [1, 5, 10], 'a': pd.Categorical(['a', 'b', 'c'])}) right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) + 'a': pd.Categorical(['a', 'c', 'b', 'a', 'b'])}) with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') From 161d917f88cf7daf7cd79982a275c77e9d8c5c66 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Tue, 30 Apr 2019 13:06:42 -0400 Subject: [PATCH 5/7] Revert "Work in progress for new test for unordered categoricals" This reverts commit 71dcb80554d0870bb306f93416d8d1bbb17b66f1. --- pandas/core/reshape/merge.py | 10 +--------- pandas/tests/reshape/merge/test_merge_asof.py | 15 +-------------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 08eae9248ddb6..bb13b959ebfd2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1443,16 +1443,8 @@ def _get_merge_keys(self): right_join_keys, join_names) = super()._get_merge_keys() - # validate index types are the same & are not unordered categoricals + # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): - # TODO Wrong place to put this. By the time we get here, the "by" - # keys (which *can* be unordered) have been added to the join keys. - if any(is_categorical_dtype(dtype) and not dtype.ordered for - dtype in [lk, rk]): - raise MergeError("incompatible merge keys [{i}] unordered " - "category, must be ordered".format(i=i)) - - if not is_dtype_equal(lk.dtype, rk.dtype): if (is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype)): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 76dc97975a1a9..f5f8e38c7d894 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1008,23 +1008,10 @@ def test_merge_datatype_error_raises(self): def test_merge_datatype_categorical_error_raises(self): msg = r'merge keys \[0\] both sides category, but not equal ones' - left = pd.DataFrame({'left_val': [1, 5, 10], - 'a': pd.Categorical(['a', 'b', 'c'], - ordered=True)}) - right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'], - ordered=True)}) - - with pytest.raises(MergeError, match=msg): - merge_asof(left, right, on='a') - - def test_merge_datatype_unordered_categorical_raises(self): - msg = r'merge keys \[0\] unordered category, must be ordered' - left = pd.DataFrame({'left_val': [1, 5, 10], 'a': pd.Categorical(['a', 'b', 'c'])}) right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': pd.Categorical(['a', 'c', 'b', 'a', 'b'])}) + 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) with pytest.raises(MergeError, match=msg): merge_asof(left, right, on='a') From 5ef471d085269688a4ef2495b293d8b50ed5c45a Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Tue, 30 Apr 2019 13:27:12 -0400 Subject: [PATCH 6/7] Add comment to clarify why the test is not checking for ordered categories --- pandas/core/reshape/merge.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bb13b959ebfd2..a7baa41acfa72 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1449,6 +1449,13 @@ def _get_merge_keys(self): if (is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype)): # The generic error message is confusing for categoricals. + # + # In this function, the join keys include both the original + # ones of the merge_asof() call, and also the keys passed + # to its by= argument. Unordered but equal categories + # are not supported for the former, but will fail + # later with a ValueError, so we don't *need* to check + # for them here. msg = ("incompatible merge keys [{i}] both sides " "category, but not equal ones" .format(i=i)) From 464fa3473689adc861fec3b122ee675d3f57ec4f Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Wed, 1 May 2019 09:46:19 -0400 Subject: [PATCH 7/7] Use repr(x.dtype) in MergeError messages --- pandas/core/reshape/merge.py | 13 +++++++------ pandas/tests/reshape/merge/test_merge_asof.py | 5 +++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a7baa41acfa72..2bebb66e10e64 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1456,14 +1456,15 @@ def _get_merge_keys(self): # are not supported for the former, but will fail # later with a ValueError, so we don't *need* to check # for them here. - msg = ("incompatible merge keys [{i}] both sides " - "category, but not equal ones" - .format(i=i)) + msg = ("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, both sides category, but not equal ones" + .format(i=i, lkdtype=repr(lk.dtype), + rkdtype=repr(rk.dtype))) else: msg = ("incompatible merge keys [{i}] {lkdtype} and " "{rkdtype}, must be the same type" - .format(i=i, lkdtype=lk.dtype, - rkdtype=rk.dtype)) + .format(i=i, lkdtype=repr(lk.dtype), + rkdtype=repr(rk.dtype))) raise MergeError(msg) # validate tolerance; must be a Timedelta if we have a DTI @@ -1477,7 +1478,7 @@ def _get_merge_keys(self): msg = ("incompatible tolerance {tolerance}, must be compat " "with type {lkdtype}".format( tolerance=type(self.tolerance), - lkdtype=lt.dtype)) + lkdtype=repr(lt.dtype))) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index f5f8e38c7d894..684fba5867c00 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -995,7 +995,7 @@ def test_on_float_by_int(self): assert_frame_equal(result, expected) def test_merge_datatype_error_raises(self): - msg = r'merge keys \[0\] object and int64, must be the same type' + msg = r'incompatible merge keys \[0\] .*, must be the same type' left = pd.DataFrame({'left_val': [1, 5, 10], 'a': ['a', 'b', 'c']}) @@ -1006,7 +1006,8 @@ def test_merge_datatype_error_raises(self): merge_asof(left, right, on='a') def test_merge_datatype_categorical_error_raises(self): - msg = r'merge keys \[0\] both sides category, but not equal ones' + msg = (r'incompatible merge keys \[0\] .* both sides category, ' + 'but not equal ones') left = pd.DataFrame({'left_val': [1, 5, 10], 'a': pd.Categorical(['a', 'b', 'c'])})