3
3
import numpy as np
4
4
import pytest
5
5
6
+ from pandas .core .dtypes .cast import find_common_type , is_dtype_equal
7
+
6
8
import pandas as pd
7
9
from pandas import DataFrame , Index , MultiIndex , Series
8
10
import pandas ._testing as tm
@@ -18,9 +20,7 @@ def test_combine_first_mixed(self):
18
20
b = Series (range (2 ), index = range (5 , 7 ))
19
21
g = DataFrame ({"A" : a , "B" : b })
20
22
21
- exp = DataFrame (
22
- {"A" : list ("abab" ), "B" : [0.0 , 1.0 , 0.0 , 1.0 ]}, index = [0 , 1 , 5 , 6 ]
23
- )
23
+ exp = DataFrame ({"A" : list ("abab" ), "B" : [0 , 1 , 0 , 1 ]}, index = [0 , 1 , 5 , 6 ])
24
24
combined = f .combine_first (g )
25
25
tm .assert_frame_equal (combined , exp )
26
26
@@ -144,7 +144,7 @@ def test_combine_first_return_obj_type_with_bools(self):
144
144
)
145
145
df2 = DataFrame ([[- 42.6 , np .nan , True ], [- 5.0 , 1.6 , False ]], index = [1 , 2 ])
146
146
147
- expected = Series ([True , True , False ], name = 2 , dtype = object )
147
+ expected = Series ([True , True , False ], name = 2 , dtype = bool )
148
148
149
149
result_12 = df1 .combine_first (df2 )[2 ]
150
150
tm .assert_series_equal (result_12 , expected )
@@ -157,22 +157,22 @@ def test_combine_first_return_obj_type_with_bools(self):
157
157
(
158
158
(
159
159
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
160
- [None , None , None ],
160
+ [pd . NaT , pd . NaT , pd . NaT ],
161
161
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
162
162
),
163
163
(
164
- [None , None , None ],
164
+ [pd . NaT , pd . NaT , pd . NaT ],
165
165
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
166
166
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
167
167
),
168
168
(
169
- [datetime (2000 , 1 , 2 ), None , None ],
169
+ [datetime (2000 , 1 , 2 ), pd . NaT , pd . NaT ],
170
170
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
171
171
[datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
172
172
),
173
173
(
174
174
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
175
- [datetime (2000 , 1 , 2 ), None , None ],
175
+ [datetime (2000 , 1 , 2 ), pd . NaT , pd . NaT ],
176
176
[datetime (2000 , 1 , 1 ), datetime (2000 , 1 , 2 ), datetime (2000 , 1 , 3 )],
177
177
),
178
178
),
@@ -196,13 +196,13 @@ def test_combine_first_align_nan(self):
196
196
197
197
res = dfa .combine_first (dfb )
198
198
exp = DataFrame (
199
- {"a" : [pd .Timestamp ("2011-01-01" ), pd .NaT ], "b" : [2.0 , 5.0 ]},
199
+ {"a" : [pd .Timestamp ("2011-01-01" ), pd .NaT ], "b" : [2 , 5 ]},
200
200
columns = ["a" , "b" ],
201
201
)
202
202
tm .assert_frame_equal (res , exp )
203
203
assert res ["a" ].dtype == "datetime64[ns]"
204
204
# "b" must stay int64: combine_first preserves the integer dtype
205
- assert res ["b" ].dtype == "float64 "
205
+ assert res ["b" ].dtype == "int64 "
206
206
207
207
res = dfa .iloc [:0 ].combine_first (dfb )
208
208
exp = DataFrame ({"a" : [np .nan , np .nan ], "b" : [4 , 5 ]}, columns = ["a" , "b" ])
@@ -219,14 +219,12 @@ def test_combine_first_timezone(self):
219
219
columns = ["UTCdatetime" , "abc" ],
220
220
data = data1 ,
221
221
index = pd .date_range ("20140627" , periods = 1 ),
222
- dtype = "object" ,
223
222
)
224
223
data2 = pd .to_datetime ("20121212 12:12" ).tz_localize ("UTC" )
225
224
df2 = DataFrame (
226
225
columns = ["UTCdatetime" , "xyz" ],
227
226
data = data2 ,
228
227
index = pd .date_range ("20140628" , periods = 1 ),
229
- dtype = "object" ,
230
228
)
231
229
res = df2 [["UTCdatetime" ]].combine_first (df1 )
232
230
exp = DataFrame (
@@ -239,13 +237,10 @@ def test_combine_first_timezone(self):
239
237
},
240
238
columns = ["UTCdatetime" , "abc" ],
241
239
index = pd .date_range ("20140627" , periods = 2 , freq = "D" ),
242
- dtype = "object" ,
243
240
)
244
241
assert res ["UTCdatetime" ].dtype == "datetime64[ns, UTC]"
245
242
assert res ["abc" ].dtype == "datetime64[ns, UTC]"
246
- # Need to cast all to "obejct" because combine_first does not retain dtypes:
247
- # GH Issue 7509
248
- res = res .astype ("object" )
243
+
249
244
tm .assert_frame_equal (res , exp )
250
245
251
246
# see gh-10567
@@ -360,12 +355,11 @@ def test_combine_first_int(self):
360
355
df2 = DataFrame ({"a" : [1 , 4 ]}, dtype = "int64" )
361
356
362
357
result_12 = df1 .combine_first (df2 )
363
- expected_12 = DataFrame ({"a" : [0 , 1 , 3 , 5 ]}, dtype = "float64" )
358
+ expected_12 = DataFrame ({"a" : [0 , 1 , 3 , 5 ]})
364
359
tm .assert_frame_equal (result_12 , expected_12 )
365
360
366
361
result_21 = df2 .combine_first (df1 )
367
- expected_21 = DataFrame ({"a" : [1 , 4 , 3 , 5 ]}, dtype = "float64" )
368
-
362
+ expected_21 = DataFrame ({"a" : [1 , 4 , 3 , 5 ]})
369
363
tm .assert_frame_equal (result_21 , expected_21 )
370
364
371
365
@pytest .mark .parametrize ("val" , [1 , 1.0 ])
@@ -404,11 +398,38 @@ def test_combine_first_string_dtype_only_na(self):
404
398
def test_combine_first_timestamp_bug (scalar1 , scalar2 , nulls_fixture ):
405
399
# GH28481
406
400
na_value = nulls_fixture
401
+
407
402
frame = DataFrame ([[na_value , na_value ]], columns = ["a" , "b" ])
408
403
other = DataFrame ([[scalar1 , scalar2 ]], columns = ["b" , "c" ])
409
404
405
+ common_dtype = find_common_type ([frame .dtypes ["b" ], other .dtypes ["b" ]])
406
+
407
+ if is_dtype_equal (common_dtype , "object" ) or frame .dtypes ["b" ] == other .dtypes ["b" ]:
408
+ val = scalar1
409
+ else :
410
+ val = na_value
411
+
412
+ result = frame .combine_first (other )
413
+
414
+ expected = DataFrame ([[na_value , val , scalar2 ]], columns = ["a" , "b" , "c" ])
415
+
416
+ expected ["b" ] = expected ["b" ].astype (common_dtype )
417
+
418
+ tm .assert_frame_equal (result , expected )
419
+
420
+
421
def test_combine_first_timestamp_bug_NaT():
    """Check ``combine_first`` when the calling frame holds only ``pd.NaT``.

    ``frame`` has NaT in columns "a" and "b"; ``other`` supplies datetimes
    for columns "b" and "c". The combined frame is expected to keep NaT in
    "a" and to carry both datetimes from ``other`` in "b" and "c".
    """
    # GH28481
    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame(
        [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"]
    )

    result = frame.combine_first(other)

    # "a" has no counterpart in ``other`` so it stays NaT; "b" is NaT in
    # ``frame`` and is filled from ``other``; "c" comes from ``other`` only.
    expected = DataFrame(
        [[pd.NaT, datetime(2020, 1, 1), datetime(2020, 1, 2)]],
        columns=["a", "b", "c"],
    )

    tm.assert_frame_equal(result, expected)
413
434
414
435
@@ -439,3 +460,25 @@ def test_combine_first_with_nan_multiindex():
439
460
index = mi_expected ,
440
461
)
441
462
tm .assert_frame_equal (res , expected )
463
+
464
+
465
def test_combine_preserve_dtypes():
    """Check that ``combine_first`` keeps column dtypes on disjoint indexes.

    ``df1`` (index 0-1) and ``df2`` (index 5-6) share only the integer
    column "B". The expected frame is built with plain integers for "B",
    so the comparison fails if "B" is upcast to float while the frames are
    aligned.
    """
    # GH7509
    a_column = Series(["a", "b"], index=range(2))
    b_first = Series(range(2), index=range(2))
    df1 = DataFrame({"A": a_column, "B": b_first})

    c_column = Series(["a", "b"], index=range(5, 7))
    b_second = Series(range(-1, 1), index=range(5, 7))
    df2 = DataFrame({"B": b_second, "C": c_column})

    # "A" and "C" each exist in one frame only, so they are NaN on the
    # other frame's rows; "B" has a value on every row.
    expected = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )

    combined = df1.combine_first(df2)
    tm.assert_frame_equal(combined, expected)
0 commit comments