diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b4331aab3085f..e8ede92c32374 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -426,6 +426,7 @@ Backwards incompatible API changes - :func:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) - :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`) +- In 0.23.x, pandas would raise a ``ValueError`` on a merge of a numeric column (e.g. ``int`` dtyped column) and an ``object`` dtyped column (:issue:`9780`). We have re-enabled the ability to merge ``object`` and other dtypes; pandas will still raise a ``ValueError`` when a numeric column is merged with a string-like ``object`` dtyped column (:issue:`21681`) Percentage change on groupby ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 58344c0ec9ec7..130bc2b080c72 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -20,7 +20,7 @@ is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, is_numeric_dtype, - needs_i8_conversion) + is_object_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -901,6 +901,8 @@ def _maybe_coerce_merge_keys(self): lk_is_cat = is_categorical_dtype(lk) rk_is_cat = is_categorical_dtype(rk) + lk_is_object = is_object_dtype(lk) + rk_is_object = is_object_dtype(rk) # if either left or right is a categorical # then the must match exactly in categories & ordered @@ -925,7 +927,7 @@ def _maybe_coerce_merge_keys(self): # the same, then proceed if is_numeric_dtype(lk) and
is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: - pass + continue # check whether ints and floats elif is_integer_dtype(rk) and is_float_dtype(lk): @@ -934,6 +936,7 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) + continue elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): @@ -941,22 +944,41 @@ def _maybe_coerce_merge_keys(self): 'columns where the float values ' 'are not equal to their int ' 'representation', UserWarning) + continue # let's infer and see if we are ok elif lib.infer_dtype(lk) == lib.infer_dtype(rk): - pass + continue # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 - # boolean values are considered as numeric, but are still allowed - # to be merged on object boolean values - elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) - and not is_numeric_dtype(rk)): - raise ValueError(msg) - elif (not is_numeric_dtype(lk) - and (is_numeric_dtype(rk) and not is_bool_dtype(rk))): - raise ValueError(msg) + # bool values are coerced to object + elif ((lk_is_object and is_bool_dtype(rk)) or + (is_bool_dtype(lk) and rk_is_object)): + pass + + # object values are allowed to be merged + elif ((lk_is_object and is_numeric_dtype(rk)) or + (is_numeric_dtype(lk) and rk_is_object)): + inferred_left = lib.infer_dtype(lk) + inferred_right = lib.infer_dtype(rk) + bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] + string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] + + # inferred bool + if (inferred_left in bool_types and + inferred_right in bool_types): + pass + + # unless we are merging non-string-like with string-like + elif ((inferred_left in string_types and + inferred_right not in string_types) or + (inferred_right in string_types and + inferred_left not in string_types)): + raise ValueError(msg) + + # datetimelikes must match exactly elif 
is_datetimelike(lk) and not is_datetimelike(rk): raise ValueError(msg) elif not is_datetimelike(lk) and is_datetimelike(rk): @@ -966,6 +988,9 @@ def _maybe_coerce_merge_keys(self): elif not is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): raise ValueError(msg) + elif lk_is_object and rk_is_object: + continue + # Houston, we have a problem! # let's coerce to object if the dtypes aren't # categorical, otherwise coerce to the category @@ -973,15 +998,14 @@ def _maybe_coerce_merge_keys(self): # then we would lose type information on some # columns, and end up trying to merge # incompatible dtypes. See GH 16900. - else: - if name in self.left.columns: - typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) - if name in self.right.columns: - typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign( + **{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign( + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 970802e94662a..7839d93f5af88 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -924,10 +924,6 @@ class TestMergeDtypes(object): @pytest.mark.parametrize('right_vals', [ ['foo', 'bar'], Series(['foo', 'bar']).astype('category'), - [1, 2], - [1.0, 2.0], - Series([1, 2], dtype='uint64'), - Series([1, 2], dtype='int32') ]) def test_different(self, right_vals): @@ -942,22 +938,8 @@ def test_different(self, right_vals): # GH 9780 # We allow merging on object and categorical cols and cast # categorical cols to object - if (is_categorical_dtype(right['A'].dtype) or - is_object_dtype(right['A'].dtype)): - result = pd.merge(left, right, on='A') - assert is_object_dtype(result.A.dtype) - - # GH 9780 - # We raise for merging on object col and int/float col and - # merging on categorical col and int/float col - else: - msg = ("You are trying to merge on " - "{lk_dtype} and {rk_dtype} columns. 
" - "If you wish to proceed you should use " - "pd.concat".format(lk_dtype=left['A'].dtype, - rk_dtype=right['A'].dtype)) - with pytest.raises(ValueError, match=msg): - pd.merge(left, right, on='A') + result = pd.merge(left, right, on='A') + assert is_object_dtype(result.A.dtype) @pytest.mark.parametrize('d1', [np.int64, np.int32, np.int16, np.int8, np.uint8]) @@ -1056,6 +1038,33 @@ def test_merge_incompat_infer_boolean_object(self): assert_frame_equal(result, expected) @pytest.mark.parametrize('df1_vals, df2_vals', [ + + # merge on category coerces to object + ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), + ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), + + # no not infer + ([0, 1], pd.Series([False, True], dtype=object)), + ([0, 1], pd.Series([False, True], dtype=bool)), + ]) + def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): + # these are explicity allowed incompat merges, that pass thru + # the result type is dependent on if the values on the rhs are + # inferred, otherwise these will be coereced to object + + df1 = DataFrame({'A': df1_vals}) + df2 = DataFrame({'A': df2_vals}) + + result = pd.merge(df1, df2, on=['A']) + assert is_object_dtype(result.A.dtype) + result = pd.merge(df2, df1, on=['A']) + assert is_object_dtype(result.A.dtype) + + @pytest.mark.parametrize('df1_vals, df2_vals', [ + # do not infer to numeric + + (Series([1, 2], dtype='uint64'), ["a", "b", "c"]), + (Series([1, 2], dtype='int32'), ["a", "b", "c"]), ([0, 1, 2], ["0", "1", "2"]), ([0.0, 1.0, 2.0], ["0", "1", "2"]), ([0, 1, 2], [u"0", u"1", u"2"]), @@ -1065,12 +1074,8 @@ def test_merge_incompat_infer_boolean_object(self): (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]), (pd.date_range('20130101', periods=3), pd.date_range('20130101', periods=3, tz='US/Eastern')), - ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), - ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), - # TODO ([0, 1], pd.Series([False, True], 
dtype=bool)), - ([0, 1], pd.Series([False, True], dtype=object)) ]) - def test_merge_incompat_dtypes(self, df1_vals, df2_vals): + def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): # GH 9780, GH 15800 # Raise a ValueError when a user tries to merge on # dtypes that are incompatible (e.g., obj and int/float)