From a6cb4b9a9d5e747e9920d4362fe3b4c813a36fc0 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Tue, 5 Dec 2017 07:26:50 +0000 Subject: [PATCH 1/3] ERR: ValueError when merging on incompatible dtypes --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/reshape/merge.py | 31 +++++++++--- pandas/tests/reshape/merge/test_merge.py | 60 ++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 32b548e5f32f1..0baa0a307c988 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -189,6 +189,7 @@ Other API Changes - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`) +- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bad7088a126cf..53fc686389623 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -27,6 +27,7 @@ is_dtype_equal, is_bool, is_list_like, + is_datetimelike, _ensure_int64, _ensure_float64, _ensure_object, @@ -962,14 +963,30 @@ def _maybe_coerce_merge_keys(self): elif lib.infer_dtype(lk) == lib.infer_dtype(rk): pass - # Houston, we have a problem! - # let's coerce to object if the dtypes aren't - # categorical, otherwise coerce to the category - # dtype. If we coerced categories to object, - # then we would lose type information on some - # columns, and end up trying to merge - # incompatible dtypes. See GH 16900. else: + + # Check if we are trying to merge on obviously + # incompatible dtypes GH 9780 + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, + rk_dtype=rk.dtype)) + if is_numeric_dtype(lk) and not is_numeric_dtype(rk): + raise ValueError(msg) + elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): + raise ValueError(msg) + elif is_datetimelike(lk) and not is_datetimelike(rk): + raise ValueError(msg) + elif not is_datetimelike(lk) and is_datetimelike(rk): + raise ValueError(msg) + + # Houston, we have a problem! + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the category + # dtype. If we coerced categories to object, + # then we would lose type information on some + # columns, and end up trying to merge + # incompatible dtypes. See GH 16900. if name in self.left.columns: typ = lk.categories.dtype if lk_is_cat else object self.left = self.left.assign( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 6f2d2ce2a8583..8b395c29124de 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -6,6 +6,7 @@ from numpy import nan import numpy as np import random +import re import pandas as pd from pandas.compat import lrange, lzip @@ -1385,14 +1386,27 @@ class TestMergeDtypes(object): def test_different(self, df): - # we expect differences by kind - # to be ok, while other differences should return object - left = df for col in df.columns: right = DataFrame({'A': df[col]}) - result = pd.merge(left, right, on='A') - assert is_object_dtype(result.A.dtype) + # GH 9780 + # We allow merging on object and categorical cols and cast + # categorical cols to object + if (is_categorical_dtype(right['A'].dtype) or + is_object_dtype(right['A'].dtype)): + result = pd.merge(left, right, on='A') + assert is_object_dtype(result.A.dtype) + # GH 9780 + # We raise for merging on object col and int/float col and + # merging on categorical col and int/float col + else: + msg = ("You are trying to merge on " + "{lk_dtype} and {rk_dtype} columns. " + "If you wish to proceed you should use " + "pd.concat".format(lk_dtype=left['A'].dtype, + rk_dtype=right['A'].dtype)) + with tm.assert_raises_regex(ValueError, msg): + pd.merge(left, right, on='A') @pytest.mark.parametrize('d1', [np.int64, np.int32, np.int16, np.int8, np.uint8]) @@ -1462,6 +1476,42 @@ def test_merge_on_ints_floats_warning(self): result = B.merge(A, left_on='Y', right_on='X') assert_frame_equal(result, expected[['Y', 'X']]) + @pytest.mark.parametrize('df1_vals, df2_vals', [ + ([0, 1, 2], ["0", "1", "2"]), + ([0.0, 1.0, 2.0], ["0", "1", "2"]), + ([0, 1, 2], [u"0", u"1", u"2"]), + (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01', + '2011-01-02']), + (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]), + (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]), + ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), + ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), + ]) + def test_merge_incompat_dtypes(self, df1_vals, df2_vals): + # GH 9780 + # Raise a ValueError when a user tries to merge on + # dtypes that are incompatible (e.g., obj and int/float) + + df1 = DataFrame({'A': df1_vals}) + df2 = DataFrame({'A': df2_vals}) + + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=df1['A'].dtype, + rk_dtype=df2['A'].dtype)) + msg = re.escape(msg) + with tm.assert_raises_regex(ValueError, msg): + pd.merge(df1, df2, on=['A']) + + # Check that error still raised when swapping order of dataframes + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=df2['A'].dtype, + rk_dtype=df1['A'].dtype)) + msg = re.escape(msg) + with tm.assert_raises_regex(ValueError, msg): + pd.merge(df2, df1, on=['A']) + @pytest.fixture def left(): From 948f6cfd0ae9d66abba667b684a7a69b9b785aa2 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 6 Dec 2017 23:51:35 +0000 Subject: [PATCH 2/3] remove whitespace --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8b395c29124de..0fca756089592 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1390,7 +1390,7 @@ def test_different(self, df): for col in df.columns: right = DataFrame({'A': df[col]}) # GH 9780 - # We allow merging on object and categorical cols and cast + # We allow merging on object and categorical cols and cast # categorical cols to object if (is_categorical_dtype(right['A'].dtype) or is_object_dtype(right['A'].dtype)): From 0ca53a5ea56712f81668b07bff8f7628c4a20816 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Thu, 7 Dec 2017 10:43:38 +0000 Subject: [PATCH 3/3] use elifs and parametrize tests --- pandas/core/reshape/merge.py | 51 ++++++++++------- pandas/tests/reshape/merge/test_merge.py | 70 +++++++++++++----------- 2 files changed, 68 insertions(+), 53 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 53fc686389623..455c6f42ac74a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -963,30 +963,41 @@ def _maybe_coerce_merge_keys(self): elif lib.infer_dtype(lk) == lib.infer_dtype(rk): pass - else: - - # Check if we are trying to merge on obviously - # incompatible dtypes GH 9780 + # Check if we are trying to merge on obviously + # incompatible dtypes GH 9780 + elif is_numeric_dtype(lk) and not is_numeric_dtype(rk): + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, + rk_dtype=rk.dtype)) + raise ValueError(msg) + elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): msg = ("You are trying to merge on {lk_dtype} and " "{rk_dtype} columns. If you wish to proceed " "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype)) - if is_numeric_dtype(lk) and not is_numeric_dtype(rk): - raise ValueError(msg) - elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): - raise ValueError(msg) - elif is_datetimelike(lk) and not is_datetimelike(rk): - raise ValueError(msg) - elif not is_datetimelike(lk) and is_datetimelike(rk): - raise ValueError(msg) - - # Houston, we have a problem! - # let's coerce to object if the dtypes aren't - # categorical, otherwise coerce to the category - # dtype. If we coerced categories to object, - # then we would lose type information on some - # columns, and end up trying to merge - # incompatible dtypes. See GH 16900. + raise ValueError(msg) + elif is_datetimelike(lk) and not is_datetimelike(rk): + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, + rk_dtype=rk.dtype)) + raise ValueError(msg) + elif not is_datetimelike(lk) and is_datetimelike(rk): + msg = ("You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, + rk_dtype=rk.dtype)) + raise ValueError(msg) + + # Houston, we have a problem! + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the category + # dtype. If we coerced categories to object, + # then we would lose type information on some + # columns, and end up trying to merge + # incompatible dtypes. See GH 16900. + else: if name in self.left.columns: typ = lk.categories.dtype if lk_is_cat else object self.left = self.left.assign( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0fca756089592..70b84f7a6225b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1371,42 +1371,46 @@ def f(): pytest.raises(NotImplementedError, f) -@pytest.fixture -def df(): - return DataFrame( - {'A': ['foo', 'bar'], - 'B': Series(['foo', 'bar']).astype('category'), - 'C': [1, 2], - 'D': [1.0, 2.0], - 'E': Series([1, 2], dtype='uint64'), - 'F': Series([1, 2], dtype='int32')}) +class TestMergeDtypes(object): + @pytest.mark.parametrize('right_vals', [ + ['foo', 'bar'], + Series(['foo', 'bar']).astype('category'), + [1, 2], + [1.0, 2.0], + Series([1, 2], dtype='uint64'), + Series([1, 2], dtype='int32') + ] + ) + def test_different(self, right_vals): + + left = DataFrame({'A': ['foo', 'bar'], + 'B': Series(['foo', 'bar']).astype('category'), + 'C': [1, 2], + 'D': [1.0, 2.0], + 'E': Series([1, 2], dtype='uint64'), + 'F': Series([1, 2], dtype='int32')}) + right = DataFrame({'A': right_vals}) -class TestMergeDtypes(object): + # GH 9780 + # We allow merging on object and categorical cols and cast + # categorical cols to object + if (is_categorical_dtype(right['A'].dtype) or + is_object_dtype(right['A'].dtype)): + result = pd.merge(left, right, on='A') + assert is_object_dtype(result.A.dtype) - def test_different(self, df): - - left = df - for col in df.columns: - right = DataFrame({'A': df[col]}) - # GH 9780 - # We allow merging on object and categorical cols and cast - # categorical cols to object - if (is_categorical_dtype(right['A'].dtype) or - is_object_dtype(right['A'].dtype)): - result = pd.merge(left, right, on='A') - assert is_object_dtype(result.A.dtype) - # GH 9780 - # We raise for merging on object col and int/float col and - # merging on categorical col and int/float col - else: - msg = ("You are trying to merge on " - "{lk_dtype} and {rk_dtype} columns. " - "If you wish to proceed you should use " - "pd.concat".format(lk_dtype=left['A'].dtype, - rk_dtype=right['A'].dtype)) - with tm.assert_raises_regex(ValueError, msg): - pd.merge(left, right, on='A') + # GH 9780 + # We raise for merging on object col and int/float col and + # merging on categorical col and int/float col + else: + msg = ("You are trying to merge on " + "{lk_dtype} and {rk_dtype} columns. " + "If you wish to proceed you should use " + "pd.concat".format(lk_dtype=left['A'].dtype, + rk_dtype=right['A'].dtype)) + with tm.assert_raises_regex(ValueError, msg): + pd.merge(left, right, on='A') @pytest.mark.parametrize('d1', [np.int64, np.int32, np.int16, np.int8, np.uint8])