Skip to content

Commit 4b22372

Browse files
committed
Merge pull request #3043 from jreback/combine_3016
BUG: frame combine_first where non-specified values could cause dtype changes (#3041)
2 parents 1fe657a + 899c147 commit 4b22372

File tree

3 files changed

+39
-4
lines changed

3 files changed

+39
-4
lines changed

RELEASE.rst

+2
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ pandas 0.11.0
146146
- Bug in DataFrame column insertion where, when the column creation fails, the existing frame is left in
147147
an irrecoverable state (GH3010_)
148148
- Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
149+
- Bug in DataFrame combine_first where non-specified values could cause dtype changes (GH3041_)
149150
- Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from
150151
other values) (GH2850_)
151152
- Unstack of a frame with no nans would always cause dtype upcasting (GH2929_)
@@ -180,6 +181,7 @@ pandas 0.11.0
180181
.. _GH3010: https://github.com/pydata/pandas/issues/3010
181182
.. _GH3012: https://github.com/pydata/pandas/issues/3012
182183
.. _GH3029: https://github.com/pydata/pandas/issues/3029
184+
.. _GH3041: https://github.com/pydata/pandas/issues/3041
183185

184186

185187
pandas 0.10.1

pandas/core/frame.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -3723,7 +3723,7 @@ def _compare(a, b):
37233723
return self._constructor(data=new_data, index=self.index,
37243724
columns=self.columns, copy=False)
37253725

3726-
def combine(self, other, func, fill_value=None):
3726+
def combine(self, other, func, fill_value=None, overwrite=True):
37273727
"""
37283728
Add two DataFrame objects and do not propagate NaN values, so if for a
37293729
(column, time) one frame is missing a value, it will default to the
@@ -3734,6 +3734,8 @@ def combine(self, other, func, fill_value=None):
37343734
other : DataFrame
37353735
func : function
37363736
fill_value : scalar value
3737+
overwrite : boolean, default True
3738+
If True then overwrite values for common keys in the calling frame
37373739
37383740
Returns
37393741
-------
@@ -3760,9 +3762,16 @@ def combine(self, other, func, fill_value=None):
37603762
series = this[col].values
37613763
otherSeries = other[col].values
37623764

3765+
this_mask = isnull(series)
3766+
other_mask = isnull(otherSeries)
3767+
3768+
# don't overwrite columns unnecessarily
3769+
# DO propagate if this column is not in the intersection
3770+
if not overwrite and other_mask.all():
3771+
result[col] = this[col].copy()
3772+
continue
3773+
37633774
if do_fill:
3764-
this_mask = isnull(series)
3765-
other_mask = isnull(otherSeries)
37663775
series = series.copy()
37673776
otherSeries = otherSeries.copy()
37683777
series[this_mask] = fill_value
@@ -3798,7 +3807,7 @@ def combine_first(self, other):
37983807
combined : DataFrame
37993808
"""
38003809
combiner = lambda x, y: np.where(isnull(x), y, x)
3801-
return self.combine(other, combiner)
3810+
return self.combine(other, combiner, overwrite=False)
38023811

38033812
def update(self, other, join='left', overwrite=True, filter_func=None,
38043813
raise_conflict=False):

pandas/tests/test_frame.py

+24
Original file line numberDiff line numberDiff line change
@@ -7248,6 +7248,30 @@ def test_combine_first_mixed_bug(self):
72487248
combined = frame1.combine_first(frame2)
72497249
self.assertEqual(len(combined.columns), 5)
72507250

7251+
# gh 3041 (same dtype-change bug as gh 3016 in update)
7252+
df = DataFrame([[1.,2.,False, True],[4.,5.,True,False]],
7253+
columns=['A','B','bool1','bool2'])
7254+
7255+
other = DataFrame([[45,45]],index=[0],columns=['A','B'])
7256+
result = df.combine_first(other)
7257+
assert_frame_equal(result, df)
7258+
7259+
df.ix[0,'A'] = np.nan
7260+
result = df.combine_first(other)
7261+
df.ix[0,'A'] = 45
7262+
assert_frame_equal(result, df)
7263+
7264+
# doc example
7265+
df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan],
7266+
'B' : [np.nan, 2., 3., np.nan, 6.]})
7267+
7268+
df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.],
7269+
'B' : [np.nan, np.nan, 3., 4., 6., 8.]})
7270+
7271+
result = df1.combine_first(df2)
7272+
expected = DataFrame({ 'A' : [1,2,3,5,3,7.], 'B' : [np.nan,2,3,4,6,8] })
7273+
assert_frame_equal(result,expected)
7274+
72517275
def test_update(self):
72527276
df = DataFrame([[1.5, nan, 3.],
72537277
[1.5, nan, 3.],

0 commit comments

Comments
 (0)