Skip to content

Commit 78947dd

Browse files
authored
BUG/PERF: Series.combine_first converting int64 to float64 (#51777)
* Series.combine_first to preserve dtype
* avoid loss of precision
* whatsnew
* fix
* simplify
* update test
1 parent 9de579e commit 78947dd

File tree

4 files changed

+24
-11
lines changed

4 files changed

+24
-11
lines changed

doc/source/whatsnew/v2.1.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ Performance improvements
115115
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`)
116116
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
117117
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
118+
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
118119

119120
.. ---------------------------------------------------------------------------
120121
.. _whatsnew_210.bug_fixes:
@@ -204,6 +205,7 @@ Reshaping
204205
^^^^^^^^^
205206
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
206207
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
208+
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
207209
-
208210

209211
Sparse

pandas/core/series.py

+14 -4
Original file line numberDiff line numberDiff line change
@@ -3191,13 +3191,23 @@ def combine_first(self, other) -> Series:
31913191
falcon NaN
31923192
dtype: float64
31933193
"""
3194+
from pandas.core.reshape.concat import concat
3195+
31943196
new_index = self.index.union(other.index)
3195-
this = self.reindex(new_index, copy=False)
3196-
other = other.reindex(new_index, copy=False)
3197+
3198+
this = self
3199+
# identify the index subset to keep for each series
3200+
keep_other = other.index.difference(this.index[notna(this)])
3201+
keep_this = this.index.difference(keep_other)
3202+
3203+
this = this.reindex(keep_this, copy=False)
3204+
other = other.reindex(keep_other, copy=False)
3205+
31973206
if this.dtype.kind == "M" and other.dtype.kind != "M":
31983207
other = to_datetime(other)
3199-
3200-
return this.where(notna(this), other)
3208+
combined = concat([this, other])
3209+
combined = combined.reindex(new_index, copy=False)
3210+
return combined.__finalize__(self, method="combine_first")
32013211

32023212
def update(self, other: Series | Sequence | Mapping) -> None:
32033213
"""

pandas/tests/extension/test_sparse.py

-7
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,6 @@ def test_where_series(self, data, na_value):
327327
self.assert_series_equal(result, expected)
328328

329329
def test_combine_first(self, data, request):
330-
if data.dtype.subtype == "int":
331-
# Right now this is upcasted to float, just like combine_first
332-
# for Series[int]
333-
mark = pytest.mark.xfail(
334-
reason="TODO(SparseArray.__setitem__) will preserve dtype."
335-
)
336-
request.node.add_marker(mark)
337330
super().test_combine_first(data)
338331

339332
def test_searchsorted(self, data_for_sorting, as_series):

pandas/tests/series/methods/test_combine_first.py

+8
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,11 @@ def test_combine_first_timezone_series_with_empty_series(self):
112112
s2 = Series(index=time_index)
113113
result = s1.combine_first(s2)
114114
tm.assert_series_equal(result, s1)
115+
116+
def test_combine_first_preserves_dtype(self):
117+
# GH51764
118+
s1 = Series([1666880195890293744, 1666880195890293837])
119+
s2 = Series([1, 2, 3])
120+
result = s1.combine_first(s2)
121+
expected = Series([1666880195890293744, 1666880195890293837, 3])
122+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)