Skip to content

Commit f2bd212

Browse files
committed
ERR: ValueError when merging on incompatible dtypes
1 parent 3e506a3 commit f2bd212

File tree

3 files changed

+81
-12
lines changed

3 files changed

+81
-12
lines changed

doc/source/whatsnew/v0.22.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -186,6 +186,8 @@ Other API Changes
186186
- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
187187
- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
188188
- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
189+
- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`)
190+
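- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types, for example an integer or float column with a string column, or a datetime column with a numeric column (:issue:`9780`)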
189191

190192
.. _whatsnew_0220.deprecations:
191193

pandas/core/reshape/merge.py

+24-7
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
is_dtype_equal,
2828
is_bool,
2929
is_list_like,
30+
is_datetimelike,
3031
_ensure_int64,
3132
_ensure_float64,
3233
_ensure_object,
@@ -962,14 +963,30 @@ def _maybe_coerce_merge_keys(self):
962963
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
963964
pass
964965

965-
# Houston, we have a problem!
966-
# let's coerce to object if the dtypes aren't
967-
# categorical, otherwise coerce to the category
968-
# dtype. If we coerced categories to object,
969-
# then we would lose type information on some
970-
# columns, and end up trying to merge
971-
# incompatible dtypes. See GH 16900.
972966
else:
967+
968+
# Check if we are trying to merge on obviously
969+
# incompatible dtypes GH 9780
970+
msg = ("You are trying to merge on {lk_dtype} and "
971+
"{rk_dtype} columns. If you wish to proceed "
972+
"you should use pd.concat".format(lk_dtype=lk.dtype,
973+
rk_dtype=rk.dtype))
974+
if is_numeric_dtype(lk) and not is_numeric_dtype(rk):
975+
raise ValueError(msg)
976+
elif not is_numeric_dtype(lk) and is_numeric_dtype(rk):
977+
raise ValueError(msg)
978+
elif is_datetimelike(lk) and not is_datetimelike(rk):
979+
raise ValueError(msg)
980+
elif not is_datetimelike(lk) and is_datetimelike(rk):
981+
raise ValueError(msg)
982+
983+
# Houston, we have a problem!
984+
# let's coerce to object if the dtypes aren't
985+
# categorical, otherwise coerce to the category
986+
# dtype. If we coerced categories to object,
987+
# then we would lose type information on some
988+
# columns, and end up trying to merge
989+
# incompatible dtypes. See GH 16900.
973990
if name in self.left.columns:
974991
typ = lk.categories.dtype if lk_is_cat else object
975992
self.left = self.left.assign(

pandas/tests/reshape/merge/test_merge.py

+55-5
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
from numpy import nan
77
import numpy as np
88
import random
9+
import re
910

1011
import pandas as pd
1112
from pandas.compat import lrange, lzip
@@ -1385,14 +1386,27 @@ class TestMergeDtypes(object):
13851386

13861387
def test_different(self, df):
13871388

1388-
# we expect differences by kind
1389-
# to be ok, while other differences should return object
1390-
13911389
left = df
13921390
for col in df.columns:
13931391
right = DataFrame({'A': df[col]})
1394-
result = pd.merge(left, right, on='A')
1395-
assert is_object_dtype(result.A.dtype)
1392+
# GH 9780
1393+
# We allow merging on object and categorical cols and cast
1394+
# categorical cols to object
1395+
if (is_categorical_dtype(right['A'].dtype) or
1396+
is_object_dtype(right['A'].dtype)):
1397+
result = pd.merge(left, right, on='A')
1398+
assert is_object_dtype(result.A.dtype)
1399+
# GH 9780
1400+
# We raise for merging on object col and int/float col and
1401+
# merging on categorical col and int/float col
1402+
else:
1403+
msg = ("You are trying to merge on "
1404+
"{lk_dtype} and {rk_dtype} columns. "
1405+
"If you wish to proceed you should use "
1406+
"pd.concat".format(lk_dtype=left['A'].dtype,
1407+
rk_dtype=right['A'].dtype))
1408+
with tm.assert_raises_regex(ValueError, msg):
1409+
pd.merge(left, right, on='A')
13961410

13971411
@pytest.mark.parametrize('d1', [np.int64, np.int32,
13981412
np.int16, np.int8, np.uint8])
@@ -1462,6 +1476,42 @@ def test_merge_on_ints_floats_warning(self):
14621476
result = B.merge(A, left_on='Y', right_on='X')
14631477
assert_frame_equal(result, expected[['Y', 'X']])
14641478

1479+
@pytest.mark.parametrize('df1_vals, df2_vals', [
1480+
([0, 1, 2], ["0", "1", "2"]),
1481+
([0.0, 1.0, 2.0], ["0", "1", "2"]),
1482+
([0, 1, 2], [u"0", u"1", u"2"]),
1483+
(pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
1484+
'2011-01-02']),
1485+
(pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
1486+
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
1487+
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
1488+
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
1489+
])
1490+
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
1491+
# GH 9780
1492+
# Raise a ValueError when a user tries to merge on
1493+
# dtypes that are incompatible (e.g., obj and int/float)
1494+
1495+
df1 = DataFrame({'A': df1_vals})
1496+
df2 = DataFrame({'A': df2_vals})
1497+
1498+
msg = ("You are trying to merge on {lk_dtype} and "
1499+
"{rk_dtype} columns. If you wish to proceed "
1500+
"you should use pd.concat".format(lk_dtype=df1['A'].dtype,
1501+
rk_dtype=df2['A'].dtype))
1502+
msg = re.escape(msg)
1503+
with tm.assert_raises_regex(ValueError, msg):
1504+
pd.merge(df1, df2, on=['A'])
1505+
1506+
# Check that error still raised when swapping order of dataframes
1507+
msg = ("You are trying to merge on {lk_dtype} and "
1508+
"{rk_dtype} columns. If you wish to proceed "
1509+
"you should use pd.concat".format(lk_dtype=df2['A'].dtype,
1510+
rk_dtype=df1['A'].dtype))
1511+
msg = re.escape(msg)
1512+
with tm.assert_raises_regex(ValueError, msg):
1513+
pd.merge(df2, df1, on=['A'])
1514+
14651515

14661516
@pytest.fixture
14671517
def left():

0 commit comments

Comments
 (0)