Skip to content

Commit aad85ad

Browse files
authored
BUG: combine_first does not retain dtypes with Timestamp DataFrames (#38145)
1 parent dda8c35 commit aad85ad

File tree

3 files changed

+76
-30
lines changed

3 files changed

+76
-30
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ Categorical
554554

555555
Datetimelike
556556
^^^^^^^^^^^^
557+
- Bug in :meth:`DataFrame.combine_first` that would convert a datetime-like column on the other :class:`DataFrame` to integer when the column is not present in the original :class:`DataFrame` (:issue:`28481`)
557558
- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`)
558559
- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`)
559560
- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g. months=12) (:issue:`34511`)

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6386,7 +6386,7 @@ def combine(
63866386
otherSeries = otherSeries.astype(new_dtype)
63876387

63886388
arr = func(series, otherSeries)
6389-
arr = maybe_downcast_to_dtype(arr, this_dtype)
6389+
arr = maybe_downcast_to_dtype(arr, new_dtype)
63906390

63916391
result[col] = arr
63926392

pandas/tests/frame/methods/test_combine_first.py

+74-29
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def test_combine_first_mixed_bug(self):
103103
combined = frame1.combine_first(frame2)
104104
assert len(combined.columns) == 5
105105

106+
def test_combine_first_same_as_in_update(self):
106107
# gh 3016 (same as in update)
107108
df = DataFrame(
108109
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
@@ -118,6 +119,7 @@ def test_combine_first_mixed_bug(self):
118119
df.loc[0, "A"] = 45
119120
tm.assert_frame_equal(result, df)
120121

122+
def test_combine_first_doc_example(self):
121123
# doc example
122124
df1 = DataFrame(
123125
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
@@ -134,38 +136,56 @@ def test_combine_first_mixed_bug(self):
134136
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
135137
tm.assert_frame_equal(result, expected)
136138

137-
# GH3552, return object dtype with bools
139+
def test_combine_first_return_obj_type_with_bools(self):
140+
# GH3552
141+
138142
df1 = DataFrame(
139143
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
140144
)
141145
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])
142146

143-
result = df1.combine_first(df2)[2]
144-
expected = Series([True, True, False], name=2)
145-
tm.assert_series_equal(result, expected)
146-
147-
# GH 3593, converting datetime64[ns] incorrectly
148-
df0 = DataFrame(
149-
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
150-
)
151-
df1 = DataFrame({"a": [None, None, None]})
152-
df2 = df1.combine_first(df0)
153-
tm.assert_frame_equal(df2, df0)
154-
155-
df2 = df0.combine_first(df1)
156-
tm.assert_frame_equal(df2, df0)
157-
158-
df0 = DataFrame(
159-
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
160-
)
161-
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
162-
df2 = df1.combine_first(df0)
163-
result = df0.copy()
164-
result.iloc[0, :] = df1.iloc[0, :]
165-
tm.assert_frame_equal(df2, result)
147+
expected = Series([True, True, False], name=2, dtype=object)
148+
149+
result_12 = df1.combine_first(df2)[2]
150+
tm.assert_series_equal(result_12, expected)
151+
152+
result_21 = df2.combine_first(df1)[2]
153+
tm.assert_series_equal(result_21, expected)
154+
155+
@pytest.mark.parametrize(
156+
"data1, data2, data_expected",
157+
(
158+
(
159+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
160+
[None, None, None],
161+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
162+
),
163+
(
164+
[None, None, None],
165+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
166+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
167+
),
168+
(
169+
[datetime(2000, 1, 2), None, None],
170+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
171+
[datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
172+
),
173+
(
174+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
175+
[datetime(2000, 1, 2), None, None],
176+
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
177+
),
178+
),
179+
)
180+
def test_combine_first_convert_datatime_correctly(
181+
self, data1, data2, data_expected
182+
):
183+
# GH 3593
166184

167-
df2 = df0.combine_first(df1)
168-
tm.assert_frame_equal(df2, df0)
185+
df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
186+
result = df1.combine_first(df2)
187+
expected = DataFrame({"a": data_expected})
188+
tm.assert_frame_equal(result, expected)
169189

170190
def test_combine_first_align_nan(self):
171191
# GH 7509 (not fixed)
@@ -339,9 +359,14 @@ def test_combine_first_int(self):
339359
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
340360
df2 = DataFrame({"a": [1, 4]}, dtype="int64")
341361

342-
res = df1.combine_first(df2)
343-
tm.assert_frame_equal(res, df1)
344-
assert res["a"].dtype == "int64"
362+
result_12 = df1.combine_first(df2)
363+
expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64")
364+
tm.assert_frame_equal(result_12, expected_12)
365+
366+
result_21 = df2.combine_first(df1)
367+
expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64")
368+
369+
tm.assert_frame_equal(result_21, expected_21)
345370

346371
@pytest.mark.parametrize("val", [1, 1.0])
347372
def test_combine_first_with_asymmetric_other(self, val):
@@ -367,6 +392,26 @@ def test_combine_first_string_dtype_only_na(self):
367392
tm.assert_frame_equal(result, expected)
368393

369394

395+
@pytest.mark.parametrize(
396+
"scalar1, scalar2",
397+
[
398+
(datetime(2020, 1, 1), datetime(2020, 1, 2)),
399+
(pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
400+
(pd.Timedelta("89 days"), pd.Timedelta("60 min")),
401+
(pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
402+
],
403+
)
404+
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
405+
# GH28481
406+
na_value = nulls_fixture
407+
frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
408+
other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])
409+
410+
result = frame.combine_first(other)
411+
expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"])
412+
tm.assert_frame_equal(result, expected)
413+
414+
370415
def test_combine_first_with_nan_multiindex():
371416
# gh-36562
372417

0 commit comments

Comments
 (0)