From 08da2d7104a0ccca1f016508b72f39d8e6405301 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 5 May 2018 23:12:06 +0900 Subject: [PATCH 1/4] BUG: Fix combine_first converts other columns type into floats unexpectedly --- pandas/core/frame.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f4b7ccb0fdf5b..f7017d2a119ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5072,14 +5072,24 @@ def combine(self, other, func, fill_value=None, overwrite=True): series[this_mask] = fill_value otherSeries[other_mask] = fill_value - # if we have different dtypes, possibly promote - new_dtype = this_dtype - if not is_dtype_equal(this_dtype, other_dtype): - new_dtype = find_common_type([this_dtype, other_dtype]) - if not is_dtype_equal(this_dtype, new_dtype): + if col not in self.columns: + # If self DataFrame does not have col in other DataFrame, + # try to promote series, which is all NaN, as other_dtype. + new_dtype = other_dtype + try: series = series.astype(new_dtype) - if not is_dtype_equal(other_dtype, new_dtype): - otherSeries = otherSeries.astype(new_dtype) + except ValueError: + # e.g. new_dtype is integer types + pass + else: + # if we have different dtypes, possibly promote + new_dtype = this_dtype + if not is_dtype_equal(this_dtype, other_dtype): + new_dtype = find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype @@ -5153,6 +5163,11 @@ def combiner(x, y, needs_i8_conversion=False): else: mask = isna(x_values) + # If the column y in other DataFrame is not in first DataFrame, + # just return y_values. + if y.name not in self.columns: + return y_values + return expressions.where(mask, y_values, x_values) return self.combine(other, combiner, overwrite=False) From 506b8e50ae3de12fc154eee3075b1af01d30ef50 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 6 May 2018 15:47:56 +0900 Subject: [PATCH 2/4] TST: Add test of combine_first with asymmetric other --- pandas/tests/frame/test_combine_concat.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 15ca65395e4fc..418e6513cc90c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -750,6 +750,16 @@ def test_combine_first_int(self): tm.assert_frame_equal(res, df1) assert res['a'].dtype == 'int64' + def test_combine_first_with_asymmetric_other(self): + # GH20699 + df1 = pd.DataFrame({'isInt': [1]}) + df2 = pd.DataFrame({'isBool': [True]}) + + res = df1.combine_first(df2) + exp = pd.DataFrame({'isBool': [True], 'isInt': [1]}) + + tm.assert_frame_equal(res, exp) + def test_concat_datetime_datetime64_frame(self): # #2624 rows = [] From 8ea1bdf9ceb986f2898fbab716f9b3c4d5d46840 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 7 May 2018 23:17:19 +0900 Subject: [PATCH 3/4] CLN: Simplify the type promote conditions in `combine` --- pandas/core/frame.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f7017d2a119ec..31719b5c99fac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5083,13 +5083,11 @@ def combine(self, other, func, fill_value=None, overwrite=True): pass else: # if we have different dtypes, possibly promote - new_dtype = this_dtype - if not is_dtype_equal(this_dtype, other_dtype): - new_dtype = find_common_type([this_dtype, other_dtype]) - if not is_dtype_equal(this_dtype, new_dtype): - series = series.astype(new_dtype) - if not is_dtype_equal(other_dtype, new_dtype): - otherSeries = otherSeries.astype(new_dtype) + new_dtype = find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype From df89c71c941909519f86c707d787d01f06e36384 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 29 Sep 2018 14:03:04 -0700 Subject: [PATCH 4/4] DOC: Add whatsnew note --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/frame.py | 2 +- pandas/tests/frame/test_combine_concat.py | 10 ++++++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 91575c311b409..83bd8ee9b3c74 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -867,3 +867,4 @@ Other - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 31719b5c99fac..6b6d0e9be931d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5077,7 +5077,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype try: - series = series.astype(new_dtype) + series = series.astype(new_dtype, copy=False) except ValueError: # e.g. new_dtype is integer types pass diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 418e6513cc90c..d1f921bc5e894 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -4,6 +4,7 @@ from datetime import datetime +import pytest import numpy as np from numpy import nan @@ -750,13 +751,14 @@ def test_combine_first_int(self): tm.assert_frame_equal(res, df1) assert res['a'].dtype == 'int64' - def test_combine_first_with_asymmetric_other(self): - # GH20699 - df1 = pd.DataFrame({'isInt': [1]}) + @pytest.mark.parametrize("val", [1, 1.0]) + def test_combine_first_with_asymmetric_other(self, val): + # see gh-20699 + df1 = pd.DataFrame({'isNum': [val]}) df2 = pd.DataFrame({'isBool': [True]}) res = df1.combine_first(df2) - exp = pd.DataFrame({'isBool': [True], 'isInt': [1]}) + exp = pd.DataFrame({'isBool': [True], 'isNum': [val]}) tm.assert_frame_equal(res, exp)