diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 91575c311b409..83bd8ee9b3c74 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -867,3 +867,4 @@ Other - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4b7ccb0fdf5b..6b6d0e9be931d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5072,9 +5072,17 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - # if we have different dtypes, possibly promote - new_dtype = this_dtype - if not is_dtype_equal(this_dtype, other_dtype): + if col not in self.columns: + # If col exists in other but not in self, series is all NaN; + # try to cast it to other_dtype. + new_dtype = other_dtype + try: + series = series.astype(new_dtype, copy=False) + except ValueError: + # e.g. 
new_dtype is an integer dtype, which cannot hold NaN + pass + else: + # if we have different dtypes, possibly promote new_dtype = find_common_type([this_dtype, other_dtype]) if not is_dtype_equal(this_dtype, new_dtype): series = series.astype(new_dtype) @@ -5153,6 +5161,11 @@ def combiner(x, y, needs_i8_conversion=False): else: mask = isna(x_values) + # If column y of the other DataFrame does not exist in self, + # just return y_values. + if y.name not in self.columns: + return y_values + return expressions.where(mask, y_values, x_values) return self.combine(other, combiner, overwrite=False) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 15ca65395e4fc..d1f921bc5e894 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -4,6 +4,7 @@ from datetime import datetime +import pytest import numpy as np from numpy import nan @@ -750,6 +751,17 @@ def test_combine_first_int(self): tm.assert_frame_equal(res, df1) assert res['a'].dtype == 'int64' + @pytest.mark.parametrize("val", [1, 1.0]) + def test_combine_first_with_asymmetric_other(self, val): + # see gh-20699: a column found only in other keeps its dtype + df1 = pd.DataFrame({'isNum': [val]}) + df2 = pd.DataFrame({'isBool': [True]}) + + res = df1.combine_first(df2) + exp = pd.DataFrame({'isBool': [True], 'isNum': [val]}) + + tm.assert_frame_equal(res, exp) + def test_concat_datetime_datetime64_frame(self): # #2624 rows = []