Skip to content

Commit 1355df6

Browse files
reidy-p and jreback
authored and committed
ERR: ValueError when merging on incompatible dtypes (#18674)
1 parent d7d8f2d commit 1355df6

File tree

3 files changed

+102
-19
lines changed

3 files changed

+102
-19
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ Other API Changes
190190
- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
191191
- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
192192
- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`)
193+
- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
193194

194195
.. _whatsnew_0220.deprecations:
195196

pandas/core/reshape/merge.py

+28
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
is_dtype_equal,
2828
is_bool,
2929
is_list_like,
30+
is_datetimelike,
3031
_ensure_int64,
3132
_ensure_float64,
3233
_ensure_object,
@@ -962,6 +963,33 @@ def _maybe_coerce_merge_keys(self):
962963
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
963964
pass
964965

966+
# Check if we are trying to merge on obviously
967+
# incompatible dtypes GH 9780
968+
elif is_numeric_dtype(lk) and not is_numeric_dtype(rk):
969+
msg = ("You are trying to merge on {lk_dtype} and "
970+
"{rk_dtype} columns. If you wish to proceed "
971+
"you should use pd.concat".format(lk_dtype=lk.dtype,
972+
rk_dtype=rk.dtype))
973+
raise ValueError(msg)
974+
elif not is_numeric_dtype(lk) and is_numeric_dtype(rk):
975+
msg = ("You are trying to merge on {lk_dtype} and "
976+
"{rk_dtype} columns. If you wish to proceed "
977+
"you should use pd.concat".format(lk_dtype=lk.dtype,
978+
rk_dtype=rk.dtype))
979+
raise ValueError(msg)
980+
elif is_datetimelike(lk) and not is_datetimelike(rk):
981+
msg = ("You are trying to merge on {lk_dtype} and "
982+
"{rk_dtype} columns. If you wish to proceed "
983+
"you should use pd.concat".format(lk_dtype=lk.dtype,
984+
rk_dtype=rk.dtype))
985+
raise ValueError(msg)
986+
elif not is_datetimelike(lk) and is_datetimelike(rk):
987+
msg = ("You are trying to merge on {lk_dtype} and "
988+
"{rk_dtype} columns. If you wish to proceed "
989+
"you should use pd.concat".format(lk_dtype=lk.dtype,
990+
rk_dtype=rk.dtype))
991+
raise ValueError(msg)
992+
965993
# Houston, we have a problem!
966994
# let's coerce to object if the dtypes aren't
967995
# categorical, otherwise coerce to the category

pandas/tests/reshape/merge/test_merge.py

+73 -19
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from numpy import nan
77
import numpy as np
88
import random
9+
import re
910

1011
import pandas as pd
1112
from pandas.compat import lrange, lzip
@@ -1370,30 +1371,47 @@ def f():
13701371
pytest.raises(NotImplementedError, f)
13711372

13721373

1373-
@pytest.fixture
1374-
def df():
1375-
return DataFrame(
1376-
{'A': ['foo', 'bar'],
1377-
'B': Series(['foo', 'bar']).astype('category'),
1378-
'C': [1, 2],
1379-
'D': [1.0, 2.0],
1380-
'E': Series([1, 2], dtype='uint64'),
1381-
'F': Series([1, 2], dtype='int32')})
1382-
1383-
13841374
class TestMergeDtypes(object):
13851375

1386-
def test_different(self, df):
1387-
1388-
# we expect differences by kind
1389-
# to be ok, while other differences should return object
1390-
1391-
left = df
1392-
for col in df.columns:
1393-
right = DataFrame({'A': df[col]})
1376+
@pytest.mark.parametrize('right_vals', [
1377+
['foo', 'bar'],
1378+
Series(['foo', 'bar']).astype('category'),
1379+
[1, 2],
1380+
[1.0, 2.0],
1381+
Series([1, 2], dtype='uint64'),
1382+
Series([1, 2], dtype='int32')
1383+
]
1384+
)
1385+
def test_different(self, right_vals):
1386+
1387+
left = DataFrame({'A': ['foo', 'bar'],
1388+
'B': Series(['foo', 'bar']).astype('category'),
1389+
'C': [1, 2],
1390+
'D': [1.0, 2.0],
1391+
'E': Series([1, 2], dtype='uint64'),
1392+
'F': Series([1, 2], dtype='int32')})
1393+
right = DataFrame({'A': right_vals})
1394+
1395+
# GH 9780
1396+
# We allow merging on object and categorical cols and cast
1397+
# categorical cols to object
1398+
if (is_categorical_dtype(right['A'].dtype) or
1399+
is_object_dtype(right['A'].dtype)):
13941400
result = pd.merge(left, right, on='A')
13951401
assert is_object_dtype(result.A.dtype)
13961402

1403+
# GH 9780
1404+
# We raise for merging on object col and int/float col and
1405+
# merging on categorical col and int/float col
1406+
else:
1407+
msg = ("You are trying to merge on "
1408+
"{lk_dtype} and {rk_dtype} columns. "
1409+
"If you wish to proceed you should use "
1410+
"pd.concat".format(lk_dtype=left['A'].dtype,
1411+
rk_dtype=right['A'].dtype))
1412+
with tm.assert_raises_regex(ValueError, msg):
1413+
pd.merge(left, right, on='A')
1414+
13971415
@pytest.mark.parametrize('d1', [np.int64, np.int32,
13981416
np.int16, np.int8, np.uint8])
13991417
@pytest.mark.parametrize('d2', [np.int64, np.float64,
@@ -1462,6 +1480,42 @@ def test_merge_on_ints_floats_warning(self):
14621480
result = B.merge(A, left_on='Y', right_on='X')
14631481
assert_frame_equal(result, expected[['Y', 'X']])
14641482

1483+
@pytest.mark.parametrize('df1_vals, df2_vals', [
1484+
([0, 1, 2], ["0", "1", "2"]),
1485+
([0.0, 1.0, 2.0], ["0", "1", "2"]),
1486+
([0, 1, 2], [u"0", u"1", u"2"]),
1487+
(pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
1488+
'2011-01-02']),
1489+
(pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
1490+
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
1491+
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
1492+
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
1493+
])
1494+
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
1495+
# GH 9780
1496+
# Raise a ValueError when a user tries to merge on
1497+
# dtypes that are incompatible (e.g., obj and int/float)
1498+
1499+
df1 = DataFrame({'A': df1_vals})
1500+
df2 = DataFrame({'A': df2_vals})
1501+
1502+
msg = ("You are trying to merge on {lk_dtype} and "
1503+
"{rk_dtype} columns. If you wish to proceed "
1504+
"you should use pd.concat".format(lk_dtype=df1['A'].dtype,
1505+
rk_dtype=df2['A'].dtype))
1506+
msg = re.escape(msg)
1507+
with tm.assert_raises_regex(ValueError, msg):
1508+
pd.merge(df1, df2, on=['A'])
1509+
1510+
# Check that error still raised when swapping order of dataframes
1511+
msg = ("You are trying to merge on {lk_dtype} and "
1512+
"{rk_dtype} columns. If you wish to proceed "
1513+
"you should use pd.concat".format(lk_dtype=df2['A'].dtype,
1514+
rk_dtype=df1['A'].dtype))
1515+
msg = re.escape(msg)
1516+
with tm.assert_raises_regex(ValueError, msg):
1517+
pd.merge(df2, df1, on=['A'])
1518+
14651519

14661520
@pytest.fixture
14671521
def left():

0 commit comments

Comments
 (0)