pandas-dev · jreback · Dec 10, 2017 · Dec 5, 2017 · Dec 6, 2017 · Dec 7, 2017
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -189,6 +189,7 @@ Other API Changes
 - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
 - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
 - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`)
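+- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types, e.g. a numeric key column with an object-dtype one, or a datetimelike key column with a non-datetimelike one (:issue:`9780`)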
 
 .. _whatsnew_0220.deprecations:
 

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -27,6 +27,7 @@
     is_dtype_equal,
     is_bool,
     is_list_like,
+    is_datetimelike,
     _ensure_int64,
     _ensure_float64,
     _ensure_object,
@@ -962,6 +963,33 @@ def _maybe_coerce_merge_keys(self):
                 elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
                     pass
 
+            # GH 9780: refuse to merge when exactly one key is numeric,
+            # or exactly one key is datetimelike (incompatible dtypes)
+            elif is_numeric_dtype(lk) and not is_numeric_dtype(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif not is_numeric_dtype(lk) and is_numeric_dtype(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif is_datetimelike(lk) and not is_datetimelike(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif not is_datetimelike(lk) and is_datetimelike(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+
             # Houston, we have a problem!
             # let's coerce to object if the dtypes aren't
             # categorical, otherwise coerce to the category

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -6,6 +6,7 @@
 from numpy import nan
 import numpy as np
 import random
+import re
 
 import pandas as pd
 from pandas.compat import lrange, lzip
@@ -1370,30 +1371,47 @@ def f():
         pytest.raises(NotImplementedError, f)
 
 
-@pytest.fixture
-def df():
-    return DataFrame(
-        {'A': ['foo', 'bar'],
-         'B': Series(['foo', 'bar']).astype('category'),
-         'C': [1, 2],
-         'D': [1.0, 2.0],
-         'E': Series([1, 2], dtype='uint64'),
-         'F': Series([1, 2], dtype='int32')})
-
-
 class TestMergeDtypes(object):
 
-    def test_different(self, df):
-
-        # we expect differences by kind
-        # to be ok, while other differences should return object
-
-        left = df
-        for col in df.columns:
-            right = DataFrame({'A': df[col]})
+    @pytest.mark.parametrize('right_vals', [
+        ['foo', 'bar'],
+        Series(['foo', 'bar']).astype('category'),
+        [1, 2],
+        [1.0, 2.0],
+        Series([1, 2], dtype='uint64'),
+        Series([1, 2], dtype='int32')
+    ]
+    )
+    def test_different(self, right_vals):
+
+        left = DataFrame({'A': ['foo', 'bar'],
+                          'B': Series(['foo', 'bar']).astype('category'),
+                          'C': [1, 2],
+                          'D': [1.0, 2.0],
+                          'E': Series([1, 2], dtype='uint64'),
+                          'F': Series([1, 2], dtype='int32')})
+        right = DataFrame({'A': right_vals})
+
+        # GH 9780
+        # We allow merging on object and categorical cols and cast
+        # categorical cols to object
+        if (is_categorical_dtype(right['A'].dtype) or
+           is_object_dtype(right['A'].dtype)):
             result = pd.merge(left, right, on='A')
             assert is_object_dtype(result.A.dtype)
 
+        # GH 9780
+        # We raise when merging the object col left['A'] with a
+        # numeric (int/uint/float) col
+        else:
+            msg = ("You are trying to merge on "
+                   "{lk_dtype} and {rk_dtype} columns. "
+                   "If you wish to proceed you should use "
+                   "pd.concat".format(lk_dtype=left['A'].dtype,
+                                      rk_dtype=right['A'].dtype))
+            with tm.assert_raises_regex(ValueError, msg):
+                pd.merge(left, right, on='A')
+
     @pytest.mark.parametrize('d1', [np.int64, np.int32,
                                     np.int16, np.int8, np.uint8])
     @pytest.mark.parametrize('d2', [np.int64, np.float64,
@@ -1462,6 +1480,42 @@ def test_merge_on_ints_floats_warning(self):
             result = B.merge(A, left_on='Y', right_on='X')
             assert_frame_equal(result, expected[['Y', 'X']])
 
+    @pytest.mark.parametrize('df1_vals, df2_vals', [
+        ([0, 1, 2], ["0", "1", "2"]),
+        ([0.0, 1.0, 2.0], ["0", "1", "2"]),
+        ([0, 1, 2], [u"0", u"1", u"2"]),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
+                                                          '2011-01-02']),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
+        ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
+        ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
+    ])
+    def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
+        # GH 9780
+        # Raise a ValueError when a user tries to merge on
+        # dtypes that are incompatible (e.g., obj and int/float)
+
+        df1 = DataFrame({'A': df1_vals})
+        df2 = DataFrame({'A': df2_vals})
+
+        msg = ("You are trying to merge on {lk_dtype} and "
+               "{rk_dtype} columns. If you wish to proceed "
+               "you should use pd.concat".format(lk_dtype=df1['A'].dtype,
+                                                 rk_dtype=df2['A'].dtype))
+        msg = re.escape(msg)
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.merge(df1, df2, on=['A'])
+
+        # Check that error still raised when swapping order of dataframes
+        msg = ("You are trying to merge on {lk_dtype} and "
+               "{rk_dtype} columns. If you wish to proceed "
+               "you should use pd.concat".format(lk_dtype=df2['A'].dtype,
+                                                 rk_dtype=df1['A'].dtype))
+        msg = re.escape(msg)
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.merge(df2, df1, on=['A'])
+
 
 @pytest.fixture
 def left():