From 660d2f517d4f358d3b5ea50214c3a6382659782e Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 21:53:43 +0200 Subject: [PATCH 01/23] Raise ValueError for non numerical join columns in merge_asof --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/merge.py | 8 ++++++++ pandas/tests/reshape/merge/test_merge_asof.py | 8 ++++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 17a830788be3f..f659f48e5dfd7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -936,6 +936,7 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non numerical merge columns (:issue:`29130`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..a6aeb6be3a0ce 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -26,6 +26,7 @@ is_bool_dtype, is_categorical_dtype, is_datetime64tz_dtype, + is_datetime64_any_dtype, is_dtype_equal, is_extension_array_dtype, is_float_dtype, @@ -35,6 +36,7 @@ is_number, is_numeric_dtype, is_object_dtype, + is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -1730,6 +1732,12 @@ def flip(xs) -> np.ndarray: ) tolerance = self.tolerance + if is_object_dtype(right_values): + raise ValueError("Right input is not from numerical dtype") + + if is_object_dtype(left_values.dtype): + raise ValueError("Left input is not from numerical dtype") + # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: side = "left" diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 9b09f0033715d..5593703b52d03 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1339,3 +1339,11 @@ def test_merge_index_column_tz(self): index=pd.Index([0, 1, 2, 3, 4]), ) tm.assert_frame_equal(result, expected) + + +def test_non_numerical_dtype(): + # GH 29130 + left = pd.DataFrame({"x": ["2019-06-01 00:09:12", "2019-06-01 00:10:29"]}) + right = pd.DataFrame({"x": ["2019-06-01 00:09:12", "2019-06-01 00:10:29"]}) + with pytest.raises(ValueError): + pd.merge_asof(left, right, on="x") From cafedccedc3fd3be68f4afb27071ac9aaf47c798 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 21:57:19 +0200 Subject: [PATCH 02/23] Delete unused imports --- pandas/core/reshape/merge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a6aeb6be3a0ce..538eb1152952e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -26,7 +26,6 @@ is_bool_dtype, is_categorical_dtype, is_datetime64tz_dtype, - is_datetime64_any_dtype, is_dtype_equal, is_extension_array_dtype, is_float_dtype, @@ -36,7 +35,6 @@ is_number, is_numeric_dtype, is_object_dtype, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries From fa1d9090c32f44272e34e0fe65f592d314f88f17 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 23:47:12 +0200 Subject: [PATCH 03/23] Implement review requests --- pandas/core/reshape/merge.py | 7 ++----- pandas/tests/reshape/merge/test_merge_asof.py | 9 +++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 538eb1152952e..f766101952572 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1730,11 +1730,8 @@ def flip(xs) -> np.ndarray: ) tolerance = self.tolerance - if is_object_dtype(right_values): - raise ValueError("Right input is not from numerical dtype") - - if is_object_dtype(left_values.dtype): - raise ValueError("Left input is not from numerical dtype") + if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): + raise TypeError("Both inputs have to be from numeric dtype") # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 5593703b52d03..353f7edb84596 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1341,9 +1341,10 @@ def test_merge_index_column_tz(self): tm.assert_frame_equal(result, expected) -def test_non_numerical_dtype(): +@pytest.mark.parametrize("data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]]) +def test_non_numerical_dtype(data): # GH 29130 - left = pd.DataFrame({"x": ["2019-06-01 00:09:12", "2019-06-01 00:10:29"]}) - right = pd.DataFrame({"x": ["2019-06-01 00:09:12", "2019-06-01 00:10:29"]}) - with pytest.raises(ValueError): + left = pd.DataFrame({"x": data}) + right = pd.DataFrame({"x": data}) + with pytest.raises(TypeError, match="Both inputs have to be from numeric dtype"): pd.merge_asof(left, right, on="x") From 34a5633dd9c358046b5eaa8f6ceb022a86cabfdd Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 30 May 2020 23:49:15 +0200 Subject: [PATCH 04/23] Run black pandas --- pandas/tests/reshape/merge/test_merge_asof.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 353f7edb84596..b80f1f16d81c3 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1341,7 +1341,10 @@ def test_merge_index_column_tz(self): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]]) +@pytest.mark.parametrize( + "data", + [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]], +) def test_non_numerical_dtype(data): # GH 29130 left = pd.DataFrame({"x": data}) From 783bc601f633f5bb096531b2c6210f3b30d06474 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 22:21:34 +0200 Subject: [PATCH 05/23] Change error message --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f766101952572..08b1eb6d775e9 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1731,7 +1731,7 @@ def flip(xs) -> np.ndarray: tolerance = self.tolerance if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): - raise TypeError("Both inputs have to be from numeric dtype") + raise TypeError("Both inputs must have numeric dtype") # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b80f1f16d81c3..c7fff4a5819ad 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1349,5 +1349,5 @@ def test_non_numerical_dtype(data): # GH 29130 left = pd.DataFrame({"x": data}) right = pd.DataFrame({"x": data}) - with pytest.raises(TypeError, match="Both inputs have to be from numeric dtype"): + with pytest.raises(TypeError, match="Both inputs must have numeric dtype"): pd.merge_asof(left, right, on="x") From e0868646641c74b78e1bdd4a48e6a2a7448210ef Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 31 May 2020 22:34:53 +0200 Subject: [PATCH 06/23] Change whats new entry --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f659f48e5dfd7..6190f3fb5ebb7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -936,7 +936,7 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) -- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non numerical merge columns (:issue:`29130`) +- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) Sparse ^^^^^^ From aaa34804eb3c3356ef71e0f7b942cbbc4de946cf Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 00:12:27 +0200 Subject: [PATCH 07/23] Change error type --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 08b1eb6d775e9..4f8d390df2f89 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1731,7 +1731,7 @@ def flip(xs) -> np.ndarray: tolerance = self.tolerance if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): - raise TypeError("Both inputs must have numeric dtype") + raise ValueError("Both inputs must have numeric dtype") # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index c7fff4a5819ad..442ea3cc4cfbb 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1349,5 +1349,5 @@ def test_non_numerical_dtype(data): # GH 29130 left = pd.DataFrame({"x": data}) right = pd.DataFrame({"x": data}) - with pytest.raises(TypeError, match="Both inputs must have numeric dtype"): + with pytest.raises(ValueError, match="Both inputs must have numeric dtype"): pd.merge_asof(left, right, on="x") From ec3147e59c37ecee09508548170de29becafcfd7 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 00:44:10 +0200 Subject: [PATCH 08/23] Move dtype check --- pandas/core/reshape/merge.py | 9 ++++++--- pandas/tests/reshape/merge/test_merge_asof.py | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4f8d390df2f89..089304208dcaa 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1640,6 +1640,12 @@ def _get_merge_keys(self): # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): + if is_object_dtype(lk.dtype) or is_object_dtype(rk.dtype): + raise ValueError( + f"Incompatible merge key [{i}] dtype, {repr(lk.dtype)} and " + f"{repr(rk.dtype)}, both sides must have numeric dtype" + ) + if not is_dtype_equal(lk.dtype, rk.dtype): if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): # The generic error message is confusing for categoricals. @@ -1730,9 +1736,6 @@ def flip(xs) -> np.ndarray: ) tolerance = self.tolerance - if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): - raise ValueError("Both inputs must have numeric dtype") - # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: side = "left" diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 442ea3cc4cfbb..7d517476a26a3 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1349,5 +1349,8 @@ def test_non_numerical_dtype(data): # GH 29130 left = pd.DataFrame({"x": data}) right = pd.DataFrame({"x": data}) - with pytest.raises(ValueError, match="Both inputs must have numeric dtype"): + with pytest.raises( + ValueError, + match=f"Incompatible merge key \[0\] .*, both sides must have numeric dtype", + ): pd.merge_asof(left, right, on="x") From 129eb77e7d6be6a7ee634c48ed7151e3e9037636 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 01:17:53 +0200 Subject: [PATCH 09/23] Use raw string --- pandas/tests/reshape/merge/test_merge_asof.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 7d517476a26a3..435f592d5703e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1351,6 +1351,6 @@ def test_non_numerical_dtype(data): right = pd.DataFrame({"x": data}) with pytest.raises( ValueError, - match=f"Incompatible merge key \[0\] .*, both sides must have numeric dtype", + match=r"Incompatible merge key \[0\] .*, both sides must have numeric dtype", ): pd.merge_asof(left, right, on="x") From 4680833f6385bbea32c56007fa100655c84401c1 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 12:18:33 +0200 Subject: [PATCH 10/23] Move check again --- pandas/core/reshape/merge.py | 11 ++++++----- pandas/tests/reshape/merge/test_merge_asof.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 089304208dcaa..807814c2ca570 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1640,11 +1640,6 @@ def _get_merge_keys(self): # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): - if is_object_dtype(lk.dtype) or is_object_dtype(rk.dtype): - raise ValueError( - f"Incompatible merge key [{i}] dtype, {repr(lk.dtype)} and " - f"{repr(rk.dtype)}, both sides must have numeric dtype" - ) if not is_dtype_equal(lk.dtype, rk.dtype): if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): @@ -1736,6 +1731,12 @@ def flip(xs) -> np.ndarray: ) tolerance = self.tolerance + if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): + raise ValueError( + f"Incompatible merge dtype, {repr(left_values.dtype)} and " + f"{repr(right_values.dtype)}, both sides must have numeric dtype" + ) + # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: side = "left" diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 435f592d5703e..118fbb7fdc9e0 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1351,6 +1351,6 @@ def test_non_numerical_dtype(data): right = pd.DataFrame({"x": data}) with pytest.raises( ValueError, - match=r"Incompatible merge key \[0\] .*, both sides must have numeric dtype", + match="Incompatible merge dtype, .*, both sides must have numeric dtype", ): pd.merge_asof(left, right, on="x") From fb55c9f094a0e3ef5cd4fb63081e98c346d42e7c Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 17:17:39 +0200 Subject: [PATCH 11/23] Move dtype check to _get_merge_keys --- pandas/core/reshape/merge.py | 14 ++++++++------ pandas/tests/reshape/merge/test_merge_asof.py | 12 +++++++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 807814c2ca570..a594e2e8d7c4d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1637,6 +1637,7 @@ def _get_merge_keys(self): # note this function has side effects (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() + len_by_cols = 0 if self.left_by is None else len(self.left_by) # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): @@ -1662,6 +1663,13 @@ def _get_merge_keys(self): ) raise MergeError(msg) + if i >= len_by_cols: + if is_object_dtype(rk.dtype) or is_object_dtype(lk.dtype): + raise ValueError( + f"Incompatible merge [{i}] dtype, {repr(lk.dtype)} and " + f"{repr(rk.dtype)}, both sides must have numeric dtype" + ) + # validate tolerance; datetime.timedelta or Timedelta if we have a DTI if self.tolerance is not None: @@ -1731,12 +1739,6 @@ def flip(xs) -> np.ndarray: ) tolerance = self.tolerance - if is_object_dtype(right_values.dtype) or is_object_dtype(left_values.dtype): - raise ValueError( - f"Incompatible merge dtype, {repr(left_values.dtype)} and " - f"{repr(right_values.dtype)}, both sides must have numeric dtype" - ) - # we require sortedness and non-null values in the join keys if not Index(left_values).is_monotonic: side = "left" diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 118fbb7fdc9e0..ab06baabec307 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1351,6 +1351,16 @@ def test_non_numerical_dtype(data): right = pd.DataFrame({"x": data}) with pytest.raises( ValueError, - match="Incompatible merge dtype, .*, both sides must have numeric dtype", + match=r"Incompatible merge \[0\] dtype, .*, both sides must have numeric dtype", ): pd.merge_asof(left, right, on="x") + + +def test_some_issues(): + left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]}) + right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]}) + with pytest.raises( + ValueError, + match=r"Incompatible merge \[1\] dtype, .*, both sides must have numeric dtype", + ): + pd.merge_asof(left, right, left_on="left_val1", right_on="a", left_by="a", right_by="left_val") From 72fb8b4fcd8133861bcad25da81126e76882b112 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 17:52:18 +0200 Subject: [PATCH 12/23] Run black pandas --- pandas/tests/reshape/merge/test_merge_asof.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index ab06baabec307..36217dc8b77b2 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1363,4 +1363,11 @@ def test_some_issues(): ValueError, match=r"Incompatible merge \[1\] dtype, .*, both sides must have numeric dtype", ): - pd.merge_asof(left, right, left_on="left_val1", right_on="a", left_by="a", right_by="left_val") + pd.merge_asof( + left, + right, + left_on="left_val1", + right_on="a", + left_by="a", + right_by="left_val", + ) From 01039740a5ade6c8244b9fc03032b43be028a1e2 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 1 Jun 2020 18:32:02 +0200 Subject: [PATCH 13/23] Overwrite _maybe_coerce_merge_keys --- pandas/core/reshape/merge.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a594e2e8d7c4d..80291be2ed80b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1637,7 +1637,6 @@ def _get_merge_keys(self): # note this function has side effects (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() - len_by_cols = 0 if self.left_by is None else len(self.left_by) # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): @@ -1663,13 +1662,6 @@ def _get_merge_keys(self): ) raise MergeError(msg) - if i >= len_by_cols: - if is_object_dtype(rk.dtype) or is_object_dtype(lk.dtype): - raise ValueError( - f"Incompatible merge [{i}] dtype, {repr(lk.dtype)} and " - f"{repr(rk.dtype)}, both sides must have numeric dtype" - ) - # validate tolerance; datetime.timedelta or Timedelta if we have a DTI if self.tolerance is not None: @@ -1714,6 +1706,19 @@ def _get_merge_keys(self): return left_join_keys, right_join_keys, join_names + def _maybe_coerce_merge_keys(self): + """Check if the merge keys are not object and call super method.""" + len_by_cols = 0 if self.left_by is None else len(self.left_by) + for i, (lk, rk) in enumerate(zip(self.left_join_keys, self.right_join_keys,)): + if i >= len_by_cols: + if is_object_dtype(rk.dtype) or is_object_dtype(lk.dtype): + raise ValueError( + f"Incompatible merge [{i}] dtype, {repr(lk.dtype)} and " + f"{repr(rk.dtype)}, both sides must have numeric dtype" + ) + + super()._maybe_coerce_merge_keys() + def _get_join_indexers(self): """ return the join indexers """ From cdaa8ef9423a46d5289df6b1e94c0bf441dce790 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 16 Sep 2020 21:59:53 +0200 Subject: [PATCH 14/23] Move whats new entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3992e697db7e4..c3ab4aaa66075 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -345,6 +345,7 @@ Reshaping - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`) +- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Sparse From 47e211ca0bc1434c48ee720ef20280e0002e03df Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Sep 2020 23:06:41 +0200 Subject: [PATCH 15/23] Add docs from other pr --- pandas/tests/reshape/merge/test_merge_asof.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6245707ea514b..653d8336cccf2 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1378,7 +1378,8 @@ def test_non_numerical_dtype(data): pd.merge_asof(left, right, on="x") -def test_some_issues(): +def test_merge_asof_string_column(): + # GH: 29130 left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]}) right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]}) with pytest.raises( From 0242f471779b0add26cde22add965883167bba2e Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Sep 2020 23:09:31 +0200 Subject: [PATCH 16/23] Run black pandas --- pandas/core/reshape/merge.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9f76b981079cf..fb3848893ab48 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1704,7 +1704,12 @@ def _get_merge_keys(self): def _maybe_coerce_merge_keys(self): """Check if the merge keys are not object and call super method.""" len_by_cols = 0 if self.left_by is None else len(self.left_by) - for i, (lk, rk) in enumerate(zip(self.left_join_keys, self.right_join_keys,)): + for i, (lk, rk) in enumerate( + zip( + self.left_join_keys, + self.right_join_keys, + ) + ): if i >= len_by_cols: if is_object_dtype(rk.dtype) or is_object_dtype(lk.dtype): raise ValueError( From ef47721cb1dadd3743335621c4bb7ea2f4ce5221 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Sep 2020 23:10:15 +0200 Subject: [PATCH 17/23] Revert "Add docs from other pr" This reverts commit 47e211ca --- pandas/tests/reshape/merge/test_merge_asof.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 653d8336cccf2..6245707ea514b 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1378,8 +1378,7 @@ def test_non_numerical_dtype(data): pd.merge_asof(left, right, on="x") -def test_merge_asof_string_column(): - # GH: 29130 +def test_some_issues(): left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]}) right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]}) with pytest.raises( From 3a66f059d386653731257b833d875383b5f26157 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 29 Dec 2020 22:05:50 +0100 Subject: [PATCH 18/23] Move whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 83bff6d7bfb2d..a671bfcaf21ff 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -286,7 +286,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) - Sparse From c0ce857cc9c21acb84e34a631da0a650581a20da Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 14:40:53 +0100 Subject: [PATCH 19/23] Move merge asof validation --- pandas/core/reshape/merge.py | 34 +++++++++---------- pandas/tests/reshape/merge/test_merge_asof.py | 13 +++---- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 602568822db56..2d96e33680b73 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1708,6 +1708,22 @@ def _validate_specification(self): if self.left_by is not None and self.right_by is None: raise MergeError("missing right_by") + lo_dtype = ( + self.left[self.left_on[0]].dtype + if not self.left_index + else self.left.index.dtype + ) + ro_dtype = ( + self.right[self.right_on[0]].dtype + if not self.right_index + else self.right.index.dtype + ) + if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): + raise ValueError( + f"Incompatible merge dtype, {repr(ro_dtype)} and " + f"{repr(lo_dtype)}, both sides must have numeric dtype" + ) + # add 'by' to our key-list so we can have it in the # output as a key if self.left_by is not None: @@ -1799,24 +1815,6 @@ def _get_merge_keys(self): return left_join_keys, right_join_keys, join_names - def _maybe_coerce_merge_keys(self): - """Check if the merge keys are not object and call super method.""" - len_by_cols = 0 if self.left_by is None else len(self.left_by) - for i, (lk, rk) in enumerate( - zip( - self.left_join_keys, - self.right_join_keys, - ) - ): - if i >= len_by_cols: - if is_object_dtype(rk.dtype) or is_object_dtype(lk.dtype): - raise ValueError( - f"Incompatible merge [{i}] dtype, {repr(lk.dtype)} and " - f"{repr(rk.dtype)}, both sides must have numeric dtype" - ) - - super()._maybe_coerce_merge_keys() - def _get_join_indexers(self): """ return the join indexers """ diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index f40d024d688b5..33642b4507e31 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1168,12 +1168,12 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) def test_merge_datatype_error_raises(self): - msg = r"incompatible merge keys \[0\] .*, must be the same type" + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) - with pytest.raises(MergeError, match=msg): + with pytest.raises(ValueError, match=msg): merge_asof(left, right, on="a") def test_merge_datatype_categorical_error_raises(self): @@ -1379,23 +1379,24 @@ def test_left_index_right_index_tolerance(self): "data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]], ) -def test_non_numerical_dtype(data): +def test_merge_asof_non_numerical_dtype(data): # GH 29130 left = pd.DataFrame({"x": data}) right = pd.DataFrame({"x": data}) with pytest.raises( ValueError, - match=r"Incompatible merge \[0\] dtype, .*, both sides must have numeric dtype", + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", ): pd.merge_asof(left, right, on="x") -def test_some_issues(): +def test_merge_asof_non_numerical_dtype_object(): + # GH#29130 left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]}) right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]}) with pytest.raises( ValueError, - match=r"Incompatible merge \[1\] dtype, .*, both sides must have numeric dtype", + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", ): pd.merge_asof( left, From 2c164c5e5fa9d6f58657eee5b9c9da843bfc809a Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 14:42:10 +0100 Subject: [PATCH 20/23] Remove newline --- pandas/core/reshape/merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2d96e33680b73..00149de286cdc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1749,7 +1749,6 @@ def _get_merge_keys(self): # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): - if not is_dtype_equal(lk.dtype, rk.dtype): if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): # The generic error message is confusing for categoricals. From a00eb9c55169ded3b5df06b98099b2acee81dd9d Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 14:45:24 +0100 Subject: [PATCH 21/23] Parametrize test further --- pandas/tests/reshape/merge/test_merge_asof.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 33642b4507e31..bcd84b2e90886 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1375,19 +1375,22 @@ def test_left_index_right_index_tolerance(self): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "kwargs", [{"on": "x"}, {"left_index": True, "right_index": True}] +) @pytest.mark.parametrize( "data", [["2019-06-01 00:09:12", "2019-06-01 00:10:29"], [1.0, "2019-06-01 00:10:29"]], ) -def test_merge_asof_non_numerical_dtype(data): - # GH 29130 - left = pd.DataFrame({"x": data}) - right = pd.DataFrame({"x": data}) +def test_merge_asof_non_numerical_dtype(kwargs, data): + # GH#29130 + left = pd.DataFrame({"x": data}, index=data) + right = pd.DataFrame({"x": data}, index=data) with pytest.raises( ValueError, match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", ): - pd.merge_asof(left, right, on="x") + pd.merge_asof(left, right, **kwargs) def test_merge_asof_non_numerical_dtype_object(): From 47822330e5f5e4274ac97671986cf7d4adbc4dcf Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 4 Jan 2021 14:52:08 +0100 Subject: [PATCH 22/23] Add comment --- pandas/core/reshape/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 67734a495ef58..ab69e93658a9c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1708,6 +1708,7 @@ def _validate_specification(self): if self.left_by is not None and self.right_by is None: raise MergeError("missing right_by") + # GH#29130 Check that merge keys do not have dtype object lo_dtype = ( self.left[self.left_on[0]].dtype if not self.left_index From 6ebb7a383fa82db2d0db3dbe357504e633faf884 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 5 Jan 2021 19:33:02 +0100 Subject: [PATCH 23/23] Change Error --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ab69e93658a9c..1caf1a2a023da 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1720,7 +1720,7 @@ def _validate_specification(self): else self.right.index.dtype ) if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): - raise ValueError( + raise MergeError( f"Incompatible merge dtype, {repr(ro_dtype)} and " f"{repr(lo_dtype)}, both sides must have numeric dtype" ) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index bcd84b2e90886..ecff63b495fbb 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1173,7 +1173,7 @@ def test_merge_datatype_error_raises(self): left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) - with pytest.raises(ValueError, match=msg): + with pytest.raises(MergeError, match=msg): merge_asof(left, right, on="a") def test_merge_datatype_categorical_error_raises(self): @@ -1387,7 +1387,7 @@ def test_merge_asof_non_numerical_dtype(kwargs, data): left = pd.DataFrame({"x": data}, index=data) right = pd.DataFrame({"x": data}, index=data) with pytest.raises( - ValueError, + MergeError, match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", ): pd.merge_asof(left, right, **kwargs) @@ -1398,7 +1398,7 @@ def test_merge_asof_non_numerical_dtype_object(): left = pd.DataFrame({"a": ["12", "13", "15"], "left_val1": ["a", "b", "c"]}) right = pd.DataFrame({"a": ["a", "b", "c"], "left_val": ["d", "e", "f"]}) with pytest.raises( - ValueError, + MergeError, match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", ): pd.merge_asof(