Skip to content

Commit 96188d6

Browse files
danielhrisca authored and luckyvs1 committed
ENH: try to preserve the dtype on combine_first for the case where the two DataFrame objects have the same columns (pandas-dev#39051)
1 parent 6ccefc9 commit 96188d6

File tree

3 files changed

+105
-21
lines changed

3 files changed

+105
-21
lines changed

doc/source/whatsnew/v1.3.0.rst

+30
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,36 @@ Notable bug fixes
6363
These are bug fixes that might have notable behavior changes.
6464

6565

66+
Preserve dtypes in :meth:`~pandas.DataFrame.combine_first`
67+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
68+
69+
:meth:`~pandas.DataFrame.combine_first` will now preserve dtypes (:issue:`7509`)
70+
71+
.. ipython:: python
72+
73+
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=[0, 1, 2])
74+
df1
75+
df2 = pd.DataFrame({"B": [4, 5, 6], "C": [1, 2, 3]}, index=[2, 3, 4])
76+
df2
77+
combined = df1.combine_first(df2)
78+
79+
*pandas 1.2.x*
80+
81+
.. code-block:: ipython
82+
83+
In [1]: combined.dtypes
84+
Out[1]:
85+
A float64
86+
B float64
87+
C float64
88+
dtype: object
89+
90+
*pandas 1.3.0*
91+
92+
.. ipython:: python
93+
94+
combined.dtypes
95+
6696
6797
.. _whatsnew_130.api_breaking.deps:
6898

pandas/core/frame.py

+12-1
Original file line number | Diff line number | Diff line change
@@ -6513,7 +6513,18 @@ def combiner(x, y):
65136513

65146514
return expressions.where(mask, y_values, x_values)
65156515

6516-
return self.combine(other, combiner, overwrite=False)
6516+
combined = self.combine(other, combiner, overwrite=False)
6517+
6518+
dtypes = {
6519+
col: find_common_type([self.dtypes[col], other.dtypes[col]])
6520+
for col in self.columns.intersection(other.columns)
6521+
if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
6522+
}
6523+
6524+
if dtypes:
6525+
combined = combined.astype(dtypes)
6526+
6527+
return combined
65176528

65186529
def update(
65196530
self,

pandas/tests/frame/methods/test_combine_first.py

+63-20
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.core.dtypes.cast import find_common_type, is_dtype_equal
7+
68
import pandas as pd
79
from pandas import DataFrame, Index, MultiIndex, Series
810
import pandas._testing as tm
@@ -18,9 +20,7 @@ def test_combine_first_mixed(self):
1820
b = Series(range(2), index=range(5, 7))
1921
g = DataFrame({"A": a, "B": b})
2022

21-
exp = DataFrame(
22-
{"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6]
23-
)
23+
exp = DataFrame({"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6])
2424
combined = f.combine_first(g)
2525
tm.assert_frame_equal(combined, exp)
2626

@@ -144,7 +144,7 @@ def test_combine_first_return_obj_type_with_bools(self):
144144
)
145145
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
146146

147-
expected = Series([True, True, False], name=2, dtype=object)
147+
expected = Series([True, True, False], name=2, dtype=bool)
148148

149149
result_12 = df1.combine_first(df2)[2]
150150
tm.assert_series_equal(result_12, expected)
@@ -157,22 +157,22 @@ def test_combine_first_return_obj_type_with_bools(self):
157157
(
158158
(
159159
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
160-
[None, None, None],
160+
[pd.NaT, pd.NaT, pd.NaT],
161161
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
162162
),
163163
(
164-
[None, None, None],
164+
[pd.NaT, pd.NaT, pd.NaT],
165165
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
166166
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
167167
),
168168
(
169-
[datetime(2000, 1, 2), None, None],
169+
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
170170
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
171171
[datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
172172
),
173173
(
174174
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
175-
[datetime(2000, 1, 2), None, None],
175+
[datetime(2000, 1, 2), pd.NaT, pd.NaT],
176176
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
177177
),
178178
),
@@ -196,13 +196,13 @@ def test_combine_first_align_nan(self):
196196

197197
res = dfa.combine_first(dfb)
198198
exp = DataFrame(
199-
{"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]},
199+
{"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]},
200200
columns=["a", "b"],
201201
)
202202
tm.assert_frame_equal(res, exp)
203203
assert res["a"].dtype == "datetime64[ns]"
204204
# ToDo: this must be int64
205-
assert res["b"].dtype == "float64"
205+
assert res["b"].dtype == "int64"
206206

207207
res = dfa.iloc[:0].combine_first(dfb)
208208
exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
@@ -219,14 +219,12 @@ def test_combine_first_timezone(self):
219219
columns=["UTCdatetime", "abc"],
220220
data=data1,
221221
index=pd.date_range("20140627", periods=1),
222-
dtype="object",
223222
)
224223
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
225224
df2 = DataFrame(
226225
columns=["UTCdatetime", "xyz"],
227226
data=data2,
228227
index=pd.date_range("20140628", periods=1),
229-
dtype="object",
230228
)
231229
res = df2[["UTCdatetime"]].combine_first(df1)
232230
exp = DataFrame(
@@ -239,13 +237,10 @@ def test_combine_first_timezone(self):
239237
},
240238
columns=["UTCdatetime", "abc"],
241239
index=pd.date_range("20140627", periods=2, freq="D"),
242-
dtype="object",
243240
)
244241
assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
245242
assert res["abc"].dtype == "datetime64[ns, UTC]"
246-
# Need to cast all to "obejct" because combine_first does not retain dtypes:
247-
# GH Issue 7509
248-
res = res.astype("object")
243+
249244
tm.assert_frame_equal(res, exp)
250245

251246
# see gh-10567
@@ -360,12 +355,11 @@ def test_combine_first_int(self):
360355
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
361356

362357
result_12 = df1.combine_first(df2)
363-
expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64")
358+
expected_12 = DataFrame({"a": [0, 1, 3, 5]})
364359
tm.assert_frame_equal(result_12, expected_12)
365360

366361
result_21 = df2.combine_first(df1)
367-
expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64")
368-
362+
expected_21 = DataFrame({"a": [1, 4, 3, 5]})
369363
tm.assert_frame_equal(result_21, expected_21)
370364

371365
@pytest.mark.parametrize("val", [1, 1.0])
@@ -404,11 +398,38 @@ def test_combine_first_string_dtype_only_na(self):
404398
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
405399
# GH28481
406400
na_value = nulls_fixture
401+
407402
frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
408403
other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])
409404

405+
common_dtype = find_common_type([frame.dtypes["b"], other.dtypes["b"]])
406+
407+
if is_dtype_equal(common_dtype, "object") or frame.dtypes["b"] == other.dtypes["b"]:
408+
val = scalar1
409+
else:
410+
val = na_value
411+
412+
result = frame.combine_first(other)
413+
414+
expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
415+
416+
expected["b"] = expected["b"].astype(common_dtype)
417+
418+
tm.assert_frame_equal(result, expected)
419+
420+
421+
def test_combine_first_timestamp_bug_NaT():
422+
# GH28481
423+
frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
424+
other = DataFrame(
425+
[[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"]
426+
)
427+
410428
result = frame.combine_first(other)
411-
expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"])
429+
expected = DataFrame(
430+
[[pd.NaT, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"]
431+
)
432+
412433
tm.assert_frame_equal(result, expected)
413434

414435

@@ -439,3 +460,25 @@ def test_combine_first_with_nan_multiindex():
439460
index=mi_expected,
440461
)
441462
tm.assert_frame_equal(res, expected)
463+
464+
465+
def test_combine_preserve_dtypes():
466+
# GH7509
467+
a_column = Series(["a", "b"], index=range(2))
468+
b_column = Series(range(2), index=range(2))
469+
df1 = DataFrame({"A": a_column, "B": b_column})
470+
471+
c_column = Series(["a", "b"], index=range(5, 7))
472+
b_column = Series(range(-1, 1), index=range(5, 7))
473+
df2 = DataFrame({"B": b_column, "C": c_column})
474+
475+
expected = DataFrame(
476+
{
477+
"A": ["a", "b", np.nan, np.nan],
478+
"B": [0, 1, -1, 0],
479+
"C": [np.nan, np.nan, "a", "b"],
480+
},
481+
index=[0, 1, 5, 6],
482+
)
483+
combined = df1.combine_first(df2)
484+
tm.assert_frame_equal(combined, expected)

0 commit comments

Comments
 (0)