# Patch summary (GH 16572): merging on an int key and a float key.
#
# Files touched by this patch:
#   * doc/source/whatsnew/v0.22.0.txt
#       Fills the empty bullet under "Backwards incompatible API changes"
#       with an entry stating that DataFrame.merge no longer casts a float
#       column to object when merging on int and float columns.
#   * pandas/core/reshape/merge.py  (hunks inside _maybe_coerce_merge_keys)
#       - The two `continue` statements are replaced by `pass`, and the
#         checks become an if/elif chain.
#       - Two new branches handle integer-vs-float key pairs: each compares
#         the float key with itself cast to the integer key's dtype and
#         emits a UserWarning when not every element is equal.
#       - The existing "coerce to object / category dtype" code is moved
#         under a new `else:` so it only runs when no earlier branch matched.
#   * pandas/tests/reshape/test_merge.py
#       - Rewraps the pandas.core.dtypes.common import in parentheses
#         (same two names, no new imports).
#       - Adds test_merge_on_ints_floats (three parametrized cases, merging
#         in both directions) and test_merge_on_ints_floats_warning (expects
#         a UserWarning for float keys 1.1, 2.5, 3.0 against int keys 1, 2, 3).
#
# NOTE(review): this copy of the patch appears to have had its line breaks
#   collapsed into spaces; indentation and blank context lines are not
#   recoverable from the text below, so the nesting level of the
#   `elif lib.infer_dtype(...)` branch cannot be confirmed from here.
# NOTE(review): the merge.py hunks use `warnings`, `is_integer_dtype` and
#   `is_float_dtype` but add no import lines; presumably merge.py already
#   imports them -- confirm against the full file.
# NOTE(review): the equality check `(lk == lk.astype(rk.dtype)).all()` is
#   not NaN-masked. A float key containing NaN would presumably compare
#   unequal to its integer cast and trigger the warning (or fail in the
#   cast) -- confirm and consider masking NaN before `.all()`.
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4a4d60b4dfbb2..1a6327554f61a 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -51,7 +51,7 @@ Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) -- +- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec0..d00aa1003988a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -906,16 +906,31 @@ def _maybe_coerce_merge_keys(self): continue # if we are numeric, then allow differing - # kinds to proceed, eg. int64 and int8 + # kinds to proceed, eg. int64 and int8, int and float # further if we are object, but we infer to # the same, then proceed if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: - continue + pass + + # check whether ints and floats + elif is_integer_dtype(rk) and is_float_dtype(lk): + if not (lk == lk.astype(rk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) + + elif is_float_dtype(rk) and is_integer_dtype(lk): + if not (rk == rk.astype(lk.dtype)).all(): + warnings.warn('You are merging on int and float ' + 'columns where the float values ' + 'are not equal to their int ' + 'representation', UserWarning) # let's infer and see if we are ok - if lib.infer_dtype(lk) == lib.infer_dtype(rk): - continue + elif lib.infer_dtype(lk) == lib.infer_dtype(rk): + pass # Houston, we have a problem! 
# let's coerce to object if the dtypes aren't @@ -924,14 +939,15 @@ def _maybe_coerce_merge_keys(self): # then we would lose type information on some # columns, and end up trying to merge # incompatible dtypes. See GH 16900. - if name in self.left.columns: - typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) - if name in self.right.columns: - typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + else: + if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object + self.left = self.left.assign( + **{name: self.left[name].astype(typ)}) + if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object + self.right = self.right.assign( + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..ee7c4e5c90bb8 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -13,7 +13,10 @@ from pandas.core.reshape.merge import merge, MergeError from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.common import is_categorical_dtype, is_object_dtype +from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_object_dtype, +) from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1408,6 +1411,42 @@ def test_join_multi_dtypes(self, d1, d2): expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ + ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), + 
([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), + ]) + def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): + # GH 16572 + # Check that float column is not cast to object if + # merging on float and int columns + A = DataFrame({'X': int_vals}) + B = DataFrame({'Y': float_vals}) + expected = DataFrame(exp_vals) + + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + + def test_merge_on_ints_floats_warning(self): + # GH 16572 + # merge will produce a warning when merging on int and + # float columns where the float values are not exactly + # equal to their int representation + A = DataFrame({'X': [1, 2, 3]}) + B = DataFrame({'Y': [1.1, 2.5, 3.0]}) + expected = DataFrame({'X': [3], 'Y': [3.0]}) + + with tm.assert_produces_warning(UserWarning): + result = A.merge(B, left_on='X', right_on='Y') + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(UserWarning): + result = B.merge(A, left_on='Y', right_on='X') + assert_frame_equal(result, expected[['Y', 'X']]) + @pytest.fixture def left():