From 0c1d126bfa316a985a6e38ddceb9c7a644561396 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Mon, 11 Jan 2021 10:05:06 +0200 Subject: [PATCH 01/13] ENH: add argument to preserve dtypes of common columns in combine_first --- pandas/core/frame.py | 20 ++++++++- .../tests/frame/methods/test_combine_first.py | 44 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e65e9302dd4d5..2d87bc325b9bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6425,7 +6425,7 @@ def combine( # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: DataFrame) -> DataFrame: + def combine_first(self, other: DataFrame, preserve_dtypes: bool = False) -> DataFrame: """ Update null elements with value in the same location in `other`. @@ -6438,6 +6438,11 @@ def combine_first(self, other: DataFrame) -> DataFrame: other : DataFrame Provided DataFrame to use to fill null values. + preserve_dtypes : bool, default False + try to preserve the column dtypes afetr combining + + .. versionadded:: 1.2.1 + Returns ------- DataFrame @@ -6482,7 +6487,18 @@ def combiner(x, y): return expressions.where(mask, y_values, x_values) - return self.combine(other, combiner, overwrite=False) + combined = self.combine(other, combiner, overwrite=False) + + if preserve_dtypes: + dtypes = { + col: find_common_type([self.dtypes[col], other.dtypes[col]]) + for col in self.columns.intersection(other.columns) + } + + if dtypes: + combined = combined.astype(dtypes) + + return combined def update( self, diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 934ad9eb8213a..75bd5ac193916 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -24,6 +24,12 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) + exp = DataFrame( + {"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6] + ) + combined = f.combine_first(g, preserve_dtypes=True) + tm.assert_frame_equal(combined, exp) + def test_combine_first(self, float_frame): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -363,9 +369,16 @@ def test_combine_first_int(self): expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") tm.assert_frame_equal(result_12, expected_12) + result_12 = df1.combine_first(df2, preserve_dtypes=True) + expected_12 = DataFrame({"a": [0, 1, 3, 5]}) + tm.assert_frame_equal(result_12, expected_12) + result_21 = df2.combine_first(df1) expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + tm.assert_frame_equal(result_21, expected_21) + result_21 = df2.combine_first(df1, preserve_dtypes=True) + expected_21 = DataFrame({"a": [1, 4, 3, 5]}) tm.assert_frame_equal(result_21, expected_21) @pytest.mark.parametrize("val", [1, 1.0]) @@ -439,3 +452,34 @@ def test_combine_first_with_nan_multiindex(): index=mi_expected, ) tm.assert_frame_equal(res, expected) + +def test_combine_preserve_dtypes(): + a = Series(["a", "b"], index=range(2)) + b = Series(range(2), index=range(2)) + f = DataFrame({"A": a, "B": b}) + + c = Series(["a", "b"], index=range(5, 7)) + b = Series(range(-1, 1), index=range(5, 7)) + g = DataFrame({"B": b, "C": c}) + + exp = DataFrame( + { + "A": ["a", "b", np.nan, np.nan], + "B": [0.0, 1.0, -1.0, 0.0], + "C": [np.nan, np.nan, "a", "b"] + }, + index=[0, 1, 5, 6] + ) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + exp = DataFrame( + { + "A": ["a", "b", np.nan, np.nan], + "B": [0, 1, -1, 0], + "C": [np.nan, np.nan, "a", "b"] + }, + index=[0, 1, 5, 6] + ) + combined = f.combine_first(g, preserve_dtypes=True) + tm.assert_frame_equal(combined, exp) From 1a5fe0feefab079b75f0f475bac09605a67b9e6d Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Mon, 11 Jan 2021 10:37:47 +0200 Subject: [PATCH 02/13] fix black code style --- pandas/core/frame.py | 4 +++- pandas/tests/frame/methods/test_combine_first.py | 13 ++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d87bc325b9bb..8fab19b652712 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6425,7 +6425,9 @@ def combine( # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: DataFrame, preserve_dtypes: bool = False) -> DataFrame: + def combine_first( + self, other: DataFrame, preserve_dtypes: bool = False + ) -> DataFrame: """ Update null elements with value in the same location in `other`. diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 75bd5ac193916..03786ad997f75 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -24,9 +24,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - exp = DataFrame( - {"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6] - ) + exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6]) combined = f.combine_first(g, preserve_dtypes=True) tm.assert_frame_equal(combined, exp) @@ -453,6 +451,7 @@ def test_combine_first_with_nan_multiindex(): ) tm.assert_frame_equal(res, expected) + def test_combine_preserve_dtypes(): a = Series(["a", "b"], index=range(2)) b = Series(range(2), index=range(2)) @@ -466,9 +465,9 @@ def test_combine_preserve_dtypes(): { "A": ["a", "b", np.nan, np.nan], "B": [0.0, 1.0, -1.0, 0.0], - "C": [np.nan, np.nan, "a", "b"] + "C": [np.nan, np.nan, "a", "b"], }, - index=[0, 1, 5, 6] + index=[0, 1, 5, 6], ) combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) @@ -477,9 +476,9 @@ def test_combine_preserve_dtypes(): { "A": ["a", "b", np.nan, np.nan], "B": [0, 1, -1, 0], - "C": [np.nan, np.nan, "a", "b"] + "C": [np.nan, np.nan, "a", "b"], }, - index=[0, 1, 5, 6] + index=[0, 1, 5, 6], ) combined = f.combine_first(g, preserve_dtypes=True) tm.assert_frame_equal(combined, exp) From 24f6ffcff8c750f76288921e48f113f56116d71c Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Mon, 11 Jan 2021 10:44:44 +0200 Subject: [PATCH 03/13] fix misspelled word in docstring --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8fab19b652712..77220201ce8b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6441,7 +6441,7 @@ def combine_first( Provided DataFrame to use to fill null values. preserve_dtypes : bool, default False - try to preserve the column dtypes afetr combining + try to preserve the column dtypes after combining .. versionadded:: 1.2.1 From d0f9ed38cac7e26a6dd9ba85aa762cc922a9a77d Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Mon, 11 Jan 2021 20:24:06 +0200 Subject: [PATCH 04/13] update tests and remove preserve_dtypes argument from combine_first --- pandas/core/frame.py | 31 +++---- .../tests/frame/methods/test_combine_first.py | 83 ++++++++++--------- 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 77220201ce8b3..cdfc9da5d6218 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6425,9 +6425,7 @@ def combine( # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first( - self, other: DataFrame, preserve_dtypes: bool = False - ) -> DataFrame: + def combine_first(self, other: DataFrame) -> DataFrame: """ Update null elements with value in the same location in `other`. @@ -6440,11 +6438,6 @@ def combine_first( other : DataFrame Provided DataFrame to use to fill null values. - preserve_dtypes : bool, default False - try to preserve the column dtypes after combining - - .. versionadded:: 1.2.1 - Returns ------- DataFrame @@ -6491,14 +6484,22 @@ def combiner(x, y): combined = self.combine(other, combiner, overwrite=False) - if preserve_dtypes: - dtypes = { - col: find_common_type([self.dtypes[col], other.dtypes[col]]) - for col in self.columns.intersection(other.columns) - } + dtypes = {} + + for col in self.columns.intersection(other.columns): + try: + if combined.dtypes[col] != self.dtypes[col]: + dtypes[col] = find_common_type( + [self.dtypes[col], other.dtypes[col]] + ) + except TypeError: + try: + combined[col] = combined[col].astype(self.dtypes[col]) + except: + pass - if dtypes: - combined = combined.astype(dtypes) + if dtypes: + combined = combined.astype(dtypes) return combined diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 03786ad997f75..4e24bdbcc6df0 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -6,6 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm +from pandas.core.dtypes.cast import find_common_type class TestDataFrameCombineFirst: @@ -18,14 +19,8 @@ def test_combine_first_mixed(self): b = Series(range(2), index=range(5, 7)) g = DataFrame({"A": a, "B": b}) - exp = DataFrame( - {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] - ) - combined = f.combine_first(g) - tm.assert_frame_equal(combined, exp) - exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6]) - combined = f.combine_first(g, preserve_dtypes=True) + combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) def test_combine_first(self, float_frame): @@ -148,7 +143,7 @@ def test_combine_first_return_obj_type_with_bools(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - expected = Series([True, True, False], name=2, dtype=object) + expected = Series([True, True, False], name=2, dtype=bool) result_12 = df1.combine_first(df2)[2] tm.assert_series_equal(result_12, expected) @@ -161,22 +156,22 @@ def test_combine_first_return_obj_type_with_bools(self): ( ( [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [None, None, None], + [pd.NaT, pd.NaT, pd.NaT], [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], ), ( - [None, None, None], + [pd.NaT, pd.NaT, pd.NaT], [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], ), ( - [datetime(2000, 1, 2), None, None], + [datetime(2000, 1, 2), pd.NaT, pd.NaT], [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)], ), ( [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [datetime(2000, 1, 2), None, None], + [datetime(2000, 1, 2), pd.NaT, pd.NaT], [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], ), ), @@ -200,13 +195,13 @@ def test_combine_first_align_nan(self): res = dfa.combine_first(dfb) exp = DataFrame( - {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]}, columns=["a", "b"], ) tm.assert_frame_equal(res, exp) assert res["a"].dtype == "datetime64[ns]" # ToDo: this must be int64 - assert res["b"].dtype == "float64" + assert res["b"].dtype == "int64" res = dfa.iloc[:0].combine_first(dfb) exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) @@ -223,14 +218,12 @@ def test_combine_first_timezone(self): columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), - dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), - dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) exp = DataFrame( @@ -243,13 +236,10 @@ def test_combine_first_timezone(self): }, columns=["UTCdatetime", "abc"], index=pd.date_range("20140627", periods=2, freq="D"), - dtype="object", ) assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" assert res["abc"].dtype == "datetime64[ns, UTC]" - # Need to cast all to "obejct" because combine_first does not retain dtypes: - # GH Issue 7509 - res = res.astype("object") + tm.assert_frame_equal(res, exp) # see gh-10567 @@ -364,18 +354,10 @@ def test_combine_first_int(self): df2 = DataFrame({"a": [1, 4]}, dtype="int64") result_12 = df1.combine_first(df2) - expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") - tm.assert_frame_equal(result_12, expected_12) - - result_12 = df1.combine_first(df2, preserve_dtypes=True) expected_12 = DataFrame({"a": [0, 1, 3, 5]}) tm.assert_frame_equal(result_12, expected_12) result_21 = df2.combine_first(df1) - expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") - tm.assert_frame_equal(result_21, expected_21) - - result_21 = df2.combine_first(df1, preserve_dtypes=True) expected_21 = DataFrame({"a": [1, 4, 3, 5]}) tm.assert_frame_equal(result_21, expected_21) @@ -415,11 +397,41 @@ def test_combine_first_string_dtype_only_na(self): def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): # GH28481 na_value = nulls_fixture + frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) + try: + common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) + except TypeError: + common_dtype = "object" + + if common_dtype == "object" or frame.dtypes["b"] == other.dtypes["b"]: + val = scalar1 + else: + val = na_value + result = frame.combine_first(other) - expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"]) + + expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"]) + + expected["b"] = expected["b"].astype(common_dtype) + + tm.assert_frame_equal(result, expected) + + +def test_combine_first_timestamp_bug_NaT(): + # GH28481 + frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"]) + other = DataFrame( + [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + ) + + result = frame.combine_first(other) + expected = DataFrame( + [[pd.NaT, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + ) + tm.assert_frame_equal(result, expected) @@ -461,17 +473,6 @@ def test_combine_preserve_dtypes(): b = Series(range(-1, 1), index=range(5, 7)) g = DataFrame({"B": b, "C": c}) - exp = DataFrame( - { - "A": ["a", "b", np.nan, np.nan], - "B": [0.0, 1.0, -1.0, 0.0], - "C": [np.nan, np.nan, "a", "b"], - }, - index=[0, 1, 5, 6], - ) - combined = f.combine_first(g) - tm.assert_frame_equal(combined, exp) - exp = DataFrame( { "A": ["a", "b", np.nan, np.nan], @@ -480,5 +481,5 @@ def test_combine_preserve_dtypes(): }, index=[0, 1, 5, 6], ) - combined = f.combine_first(g, preserve_dtypes=True) + combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) From 198eaa4ad3b23810dbb5a0f78934e1365c66a983 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Mon, 11 Jan 2021 20:33:51 +0200 Subject: [PATCH 05/13] fix isort and flake8 errors --- pandas/core/frame.py | 8 +++++++- pandas/tests/frame/methods/test_combine_first.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cdfc9da5d6218..eb675e6e29896 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6488,14 +6488,20 @@ def combiner(x, y): for col in self.columns.intersection(other.columns): try: + # if the column has different dtype in the + # DataFrame objects then add the common dtype + # to the columns dtype conversion dict if combined.dtypes[col] != self.dtypes[col]: dtypes[col] = find_common_type( [self.dtypes[col], other.dtypes[col]] ) except TypeError: + # numpy dtype was compared with pandas dtype try: + # just try to apply the initial column dtype combined[col] = combined[col].astype(self.dtypes[col]) - except: + except ValueError: + # could not apply the initial dtype, so skip pass if dtypes: diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 4e24bdbcc6df0..350caf7776289 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas.core.dtypes.cast import find_common_type + import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm -from pandas.core.dtypes.cast import find_common_type class TestDataFrameCombineFirst: From f209590de4cdf766c16ef57a3f44084d48c10c41 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Tue, 12 Jan 2021 09:04:51 +0200 Subject: [PATCH 06/13] updates according to erview --- pandas/core/frame.py | 25 +++++-------------- .../tests/frame/methods/test_combine_first.py | 10 +++----- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eb675e6e29896..6cc70f96e9d0f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6484,25 +6484,12 @@ def combiner(x, y): combined = self.combine(other, combiner, overwrite=False) - dtypes = {} - - for col in self.columns.intersection(other.columns): - try: - # if the column has different dtype in the - # DataFrame objects then add the common dtype - # to the columns dtype conversion dict - if combined.dtypes[col] != self.dtypes[col]: - dtypes[col] = find_common_type( - [self.dtypes[col], other.dtypes[col]] - ) - except TypeError: - # numpy dtype was compared with pandas dtype - try: - # just try to apply the initial column dtype - combined[col] = combined[col].astype(self.dtypes[col]) - except ValueError: - # could not apply the initial dtype, so skip - pass + dtypes = { + col: find_common_type([self.dtypes[col], other.dtypes[col]]) + for col in self.columns.intersection(other.columns) + if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) + and not is_dtype_equal(combined.dtypes[col], find_common_type([self.dtypes[col], other.dtypes[col]])) + } if dtypes: combined = combined.astype(dtypes) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 350caf7776289..e3a1d5792d24d 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.core.dtypes.cast import find_common_type +from pandas.core.dtypes.cast import find_common_type, is_dtype_equal import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series @@ -402,12 +402,9 @@ def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) - try: - common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) - except TypeError: - common_dtype = "object" + common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]]) - if common_dtype == "object" or frame.dtypes["b"] == other.dtypes["b"]: + if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]: val = scalar1 else: val = na_value @@ -466,6 +463,7 @@ def test_combine_first_with_nan_multiindex(): def test_combine_preserve_dtypes(): + # GH7509 a = Series(["a", "b"], index=range(2)) b = Series(range(2), index=range(2)) f = DataFrame({"A": a, "B": b}) From 5e252d081146ddab89a88c612d7e9184288294ae Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Tue, 12 Jan 2021 09:49:24 +0200 Subject: [PATCH 07/13] update whatsnew with example code --- doc/source/whatsnew/v1.3.0.rst | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index cb97fdeccd579..1853f25da0e34 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -65,6 +65,41 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`~pandas.DataFrame.combine_first` will now preserve dtypes (:issue:`7509`) + +*pandas 1.2.x* + +.. code-block:: ipython + + In [1]: df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) + df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) + combined = df1.combine_first(df2) + (combined, "---------------", combined.dtypes) + Out[2]: + ( A B C + 0 1.0 1.0 NaN + 1 2.0 2.0 NaN + 2 3.0 3.0 1.0 + 3 NaN 5.0 2.0 + 4 NaN 6.0 3.0, + '---------------', + A float64 + B float64 + C float64 + dtype: object) + +*pandas 1.3.0* + +.. ipython:: python + + df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) + df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) + combined = df1.combine_first(df2) + (combined, "---------------", combined.dtypes) + .. _whatsnew_130.api_breaking.deps: From 7c67e3cfb44b39da619235d1c04d8ca9aa1f1e37 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Tue, 12 Jan 2021 09:51:12 +0200 Subject: [PATCH 08/13] wrong header in documentation --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1853f25da0e34..7c53c567646ec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -66,7 +66,7 @@ These are bug fixes that might have notable behavior changes. Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`~pandas.DataFrame.combine_first` will now preserve dtypes (:issue:`7509`) From 47d091195d45501ea8643712f0391c6efa66c330 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Tue, 12 Jan 2021 09:53:17 +0200 Subject: [PATCH 09/13] fix black code style --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6cc70f96e9d0f..7db27c9c29ec8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6488,7 +6488,10 @@ def combiner(x, y): col: find_common_type([self.dtypes[col], other.dtypes[col]]) for col in self.columns.intersection(other.columns) if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) - and not is_dtype_equal(combined.dtypes[col], find_common_type([self.dtypes[col], other.dtypes[col]])) + and not is_dtype_equal( + combined.dtypes[col], + find_common_type([self.dtypes[col], other.dtypes[col]]), + ) } if dtypes: From 1b5691cc7143b95e7a765471e1d136fdca66b566 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Wed, 13 Jan 2021 08:02:50 +0200 Subject: [PATCH 10/13] update whatsnew with example code as requested in the review --- doc/source/whatsnew/v1.3.0.rst | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7c53c567646ec..eba01f609ef0c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -70,14 +70,17 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` :meth:`~pandas.DataFrame.combine_first` will now preserve dtypes (:issue:`7509`) +.. ipython:: python + + df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) + df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) + combined = df1.combine_first(df2) + *pandas 1.2.x* .. code-block:: ipython - In [1]: df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) - df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) - combined = df1.combine_first(df2) - (combined, "---------------", combined.dtypes) + In [1]: (combined, "---------------", combined.dtypes) Out[2]: ( A B C 0 1.0 1.0 NaN @@ -95,10 +98,19 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` .. ipython:: python - df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) - df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) - combined = df1.combine_first(df2) - (combined, "---------------", combined.dtypes) + In [1]: (combined, "---------------", combined.dtypes) + Out[2]: + ( A B C + 0 1.0 1 NaN + 1 2.0 2 NaN + 2 3.0 3 1.0 + 3 NaN 5 2.0 + 4 NaN 6 3.0, + '---------------', + A float64 + B int64 + C float64 + dtype: object) .. _whatsnew_130.api_breaking.deps: From ba49f9cc3a3ae8c9947fb4eae696022f96378cfd Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Wed, 13 Jan 2021 15:54:16 +0200 Subject: [PATCH 11/13] remove redundant check --- pandas/core/frame.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7db27c9c29ec8..897ff7e12cc9e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6488,10 +6488,6 @@ def combiner(x, y): col: find_common_type([self.dtypes[col], other.dtypes[col]]) for col in self.columns.intersection(other.columns) if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) - and not is_dtype_equal( - combined.dtypes[col], - find_common_type([self.dtypes[col], other.dtypes[col]]), - ) } if dtypes: From a2d4e383c6d9b6541bda10f65921b6a9595c94b6 Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Wed, 13 Jan 2021 16:40:06 +0200 Subject: [PATCH 12/13] further fix and polish the whatsnew entry --- doc/source/whatsnew/v1.3.0.rst | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index eba01f609ef0c..557663d10accf 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -73,44 +73,27 @@ Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` .. ipython:: python df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2]) + df1 df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4]) + df2 combined = df1.combine_first(df2) *pandas 1.2.x* .. code-block:: ipython - In [1]: (combined, "---------------", combined.dtypes) + In [1]: combined.dtypes Out[2]: - ( A B C - 0 1.0 1.0 NaN - 1 2.0 2.0 NaN - 2 3.0 3.0 1.0 - 3 NaN 5.0 2.0 - 4 NaN 6.0 3.0, - '---------------', A float64 B float64 C float64 - dtype: object) + dtype: object *pandas 1.3.0* .. ipython:: python - In [1]: (combined, "---------------", combined.dtypes) - Out[2]: - ( A B C - 0 1.0 1 NaN - 1 2.0 2 NaN - 2 3.0 3 1.0 - 3 NaN 5 2.0 - 4 NaN 6 3.0, - '---------------', - A float64 - B int64 - C float64 - dtype: object) + combined.dtypes .. _whatsnew_130.api_breaking.deps: From f9379289ac6daf75ade5df90d5e81a0593f0745a Mon Sep 17 00:00:00 2001 From: danielhrisca Date: Fri, 15 Jan 2021 16:43:18 +0200 Subject: [PATCH 13/13] fix single letter variable names --- .../tests/frame/methods/test_combine_first.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index e3a1d5792d24d..1325bfbda24c6 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -464,15 +464,15 @@ def test_combine_first_with_nan_multiindex(): def test_combine_preserve_dtypes(): # GH7509 - a = Series(["a", "b"], index=range(2)) - b = Series(range(2), index=range(2)) - f = DataFrame({"A": a, "B": b}) + a_column = Series(["a", "b"], index=range(2)) + b_column = Series(range(2), index=range(2)) + df1 = DataFrame({"A": a_column, "B": b_column}) - c = Series(["a", "b"], index=range(5, 7)) - b = Series(range(-1, 1), index=range(5, 7)) - g = DataFrame({"B": b, "C": c}) + c_column = Series(["a", "b"], index=range(5, 7)) + b_column = Series(range(-1, 1), index=range(5, 7)) + df2 = DataFrame({"B": b_column, "C": c_column}) - exp = DataFrame( + expected = DataFrame( { "A": ["a", "b", np.nan, np.nan], "B": [0, 1, -1, 0], @@ -480,5 +480,5 @@ def test_combine_preserve_dtypes(): }, index=[0, 1, 5, 6], ) - combined = f.combine_first(g) - tm.assert_frame_equal(combined, exp) + combined = df1.combine_first(df2) + tm.assert_frame_equal(combined, expected)