Skip to content

Commit 819418e

Browse files
jorisvandenbosschejreback
authored and committed
Allow merging on object / non-object column (#21681)
1 parent c08de6b commit 819418e

File tree

3 files changed

+75
-45
lines changed

3 files changed

+75
-45
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,7 @@ Backwards incompatible API changes
432432
- The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`)
433433
- Incorrectly passing a :class:`DatetimeIndex` to :meth:`MultiIndex.from_tuples`, rather than a sequence of tuples, now raises a ``TypeError`` rather than a ``ValueError`` (:issue:`24024`)
434434
- :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`)
435+
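- In 0.23.x, pandas would raise a ``ValueError`` on a merge of a numeric column (e.g. ``int`` dtyped column) and an ``object`` dtyped column (:issue:`9780`). We have re-enabled the ability to merge ``object`` and other dtypes; pandas will still raise on a merge between a numeric column and an ``object`` dtyped column that is composed only of strings (:issue:`21681`)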
435436

436437
Percentage change on groupby
437438
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/reshape/merge.py

+44-20
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
2121
is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
2222
is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
23-
needs_i8_conversion)
23+
is_object_dtype, needs_i8_conversion)
2424
from pandas.core.dtypes.missing import isnull, na_value_for_dtype
2525

2626
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
@@ -901,6 +901,8 @@ def _maybe_coerce_merge_keys(self):
901901

902902
lk_is_cat = is_categorical_dtype(lk)
903903
rk_is_cat = is_categorical_dtype(rk)
904+
lk_is_object = is_object_dtype(lk)
905+
rk_is_object = is_object_dtype(rk)
904906

905907
# if either left or right is a categorical
906908
# then they must match exactly in categories & ordered
@@ -925,7 +927,7 @@ def _maybe_coerce_merge_keys(self):
925927
# the same, then proceed
926928
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
927929
if lk.dtype.kind == rk.dtype.kind:
928-
pass
930+
continue
929931

930932
# check whether ints and floats
931933
elif is_integer_dtype(rk) and is_float_dtype(lk):
@@ -934,29 +936,49 @@ def _maybe_coerce_merge_keys(self):
934936
'columns where the float values '
935937
'are not equal to their int '
936938
'representation', UserWarning)
939+
continue
937940

938941
elif is_float_dtype(rk) and is_integer_dtype(lk):
939942
if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
940943
warnings.warn('You are merging on int and float '
941944
'columns where the float values '
942945
'are not equal to their int '
943946
'representation', UserWarning)
947+
continue
944948

945949
# let's infer and see if we are ok
946950
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
947-
pass
951+
continue
948952

949953
# Check if we are trying to merge on obviously
950954
# incompatible dtypes GH 9780, GH 15800
951955

952-
# boolean values are considered as numeric, but are still allowed
953-
# to be merged on object boolean values
954-
elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
955-
and not is_numeric_dtype(rk)):
956-
raise ValueError(msg)
957-
elif (not is_numeric_dtype(lk)
958-
and (is_numeric_dtype(rk) and not is_bool_dtype(rk))):
959-
raise ValueError(msg)
956+
# bool values are coerced to object
957+
elif ((lk_is_object and is_bool_dtype(rk)) or
958+
(is_bool_dtype(lk) and rk_is_object)):
959+
pass
960+
961+
# object values are allowed to be merged
962+
elif ((lk_is_object and is_numeric_dtype(rk)) or
963+
(is_numeric_dtype(lk) and rk_is_object)):
964+
inferred_left = lib.infer_dtype(lk)
965+
inferred_right = lib.infer_dtype(rk)
966+
bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
967+
string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']
968+
969+
# inferred bool
970+
if (inferred_left in bool_types and
971+
inferred_right in bool_types):
972+
pass
973+
974+
# unless we are merging non-string-like with string-like
975+
elif ((inferred_left in string_types and
976+
inferred_right not in string_types) or
977+
(inferred_right in string_types and
978+
inferred_left not in string_types)):
979+
raise ValueError(msg)
980+
981+
# datetimelikes must match exactly
960982
elif is_datetimelike(lk) and not is_datetimelike(rk):
961983
raise ValueError(msg)
962984
elif not is_datetimelike(lk) and is_datetimelike(rk):
@@ -966,22 +988,24 @@ def _maybe_coerce_merge_keys(self):
966988
elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
967989
raise ValueError(msg)
968990

991+
elif lk_is_object and rk_is_object:
992+
continue
993+
969994
# Houston, we have a problem!
970995
# let's coerce to object if the dtypes aren't
971996
# categorical, otherwise coerce to the category
972997
# dtype. If we coerced categories to object,
973998
# then we would lose type information on some
974999
# columns, and end up trying to merge
9751000
# incompatible dtypes. See GH 16900.
976-
else:
977-
if name in self.left.columns:
978-
typ = lk.categories.dtype if lk_is_cat else object
979-
self.left = self.left.assign(
980-
**{name: self.left[name].astype(typ)})
981-
if name in self.right.columns:
982-
typ = rk.categories.dtype if rk_is_cat else object
983-
self.right = self.right.assign(
984-
**{name: self.right[name].astype(typ)})
1001+
if name in self.left.columns:
1002+
typ = lk.categories.dtype if lk_is_cat else object
1003+
self.left = self.left.assign(
1004+
**{name: self.left[name].astype(typ)})
1005+
if name in self.right.columns:
1006+
typ = rk.categories.dtype if rk_is_cat else object
1007+
self.right = self.right.assign(
1008+
**{name: self.right[name].astype(typ)})
9851009

9861010
def _validate_specification(self):
9871011
# Hm, any way to make this logic less complicated??

pandas/tests/reshape/merge/test_merge.py

+30-25
Original file line numberDiff line numberDiff line change
@@ -926,10 +926,6 @@ class TestMergeDtypes(object):
926926
@pytest.mark.parametrize('right_vals', [
927927
['foo', 'bar'],
928928
Series(['foo', 'bar']).astype('category'),
929-
[1, 2],
930-
[1.0, 2.0],
931-
Series([1, 2], dtype='uint64'),
932-
Series([1, 2], dtype='int32')
933929
])
934930
def test_different(self, right_vals):
935931

@@ -944,22 +940,8 @@ def test_different(self, right_vals):
944940
# GH 9780
945941
# We allow merging on object and categorical cols and cast
946942
# categorical cols to object
947-
if (is_categorical_dtype(right['A'].dtype) or
948-
is_object_dtype(right['A'].dtype)):
949-
result = pd.merge(left, right, on='A')
950-
assert is_object_dtype(result.A.dtype)
951-
952-
# GH 9780
953-
# We raise for merging on object col and int/float col and
954-
# merging on categorical col and int/float col
955-
else:
956-
msg = ("You are trying to merge on "
957-
"{lk_dtype} and {rk_dtype} columns. "
958-
"If you wish to proceed you should use "
959-
"pd.concat".format(lk_dtype=left['A'].dtype,
960-
rk_dtype=right['A'].dtype))
961-
with pytest.raises(ValueError, match=msg):
962-
pd.merge(left, right, on='A')
943+
result = pd.merge(left, right, on='A')
944+
assert is_object_dtype(result.A.dtype)
963945

964946
@pytest.mark.parametrize('d1', [np.int64, np.int32,
965947
np.int16, np.int8, np.uint8])
@@ -1058,6 +1040,33 @@ def test_merge_incompat_infer_boolean_object(self):
10581040
assert_frame_equal(result, expected)
10591041

10601042
@pytest.mark.parametrize('df1_vals, df2_vals', [
1043+
1044+
# merge on category coerces to object
1045+
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
1046+
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
1047+
1048+
# do not infer
1049+
([0, 1], pd.Series([False, True], dtype=object)),
1050+
([0, 1], pd.Series([False, True], dtype=bool)),
1051+
])
1052+
def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
1053+
# these are explicitly allowed incompat merges, that pass thru
1054+
# the result type is dependent on if the values on the rhs are
1055+
# inferred, otherwise these will be coerced to object
1056+
1057+
df1 = DataFrame({'A': df1_vals})
1058+
df2 = DataFrame({'A': df2_vals})
1059+
1060+
result = pd.merge(df1, df2, on=['A'])
1061+
assert is_object_dtype(result.A.dtype)
1062+
result = pd.merge(df2, df1, on=['A'])
1063+
assert is_object_dtype(result.A.dtype)
1064+
1065+
@pytest.mark.parametrize('df1_vals, df2_vals', [
1066+
# do not infer to numeric
1067+
1068+
(Series([1, 2], dtype='uint64'), ["a", "b", "c"]),
1069+
(Series([1, 2], dtype='int32'), ["a", "b", "c"]),
10611070
([0, 1, 2], ["0", "1", "2"]),
10621071
([0.0, 1.0, 2.0], ["0", "1", "2"]),
10631072
([0, 1, 2], [u"0", u"1", u"2"]),
@@ -1067,12 +1076,8 @@ def test_merge_incompat_infer_boolean_object(self):
10671076
(pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
10681077
(pd.date_range('20130101', periods=3),
10691078
pd.date_range('20130101', periods=3, tz='US/Eastern')),
1070-
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
1071-
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
1072-
# TODO ([0, 1], pd.Series([False, True], dtype=bool)),
1073-
([0, 1], pd.Series([False, True], dtype=object))
10741079
])
1075-
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
1080+
def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
10761081
# GH 9780, GH 15800
10771082
# Raise a ValueError when a user tries to merge on
10781083
# dtypes that are incompatible (e.g., obj and int/float)

0 commit comments

Comments
 (0)