BUG/PERF: Series.combine_first converting int64 to float64 (#51777)

lukemanley · web-flow · commit 78947dd9fb06 · 2023-03-10T10:57:03.000-08:00
* Series.combine_first to preserve dtype

* avoid loss of precision

* whatsnew

* fix

* simplify

* update test
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -115,6 +115,7 @@ Performance improvements
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`)
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
+- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
@@ -204,6 +205,7 @@ Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
 - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
+- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
 -
 
 Sparse
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3191,13 +3191,23 @@ def combine_first(self, other) -> Series:
         falcon      NaN
         dtype: float64
         """
+        from pandas.core.reshape.concat import concat
+
         new_index = self.index.union(other.index)
-        this = self.reindex(new_index, copy=False)
-        other = other.reindex(new_index, copy=False)
+
+        this = self
+        # from other keep labels absent from (or NA in) this; from this keep the rest
+        keep_other = other.index.difference(this.index[notna(this)])
+        keep_this = this.index.difference(keep_other)
+
+        this = this.reindex(keep_this, copy=False)
+        other = other.reindex(keep_other, copy=False)
+
         if this.dtype.kind == "M" and other.dtype.kind != "M":
             other = to_datetime(other)
-
-        return this.where(notna(this), other)
+        combined = concat([this, other])
+        combined = combined.reindex(new_index, copy=False)
+        return combined.__finalize__(self, method="combine_first")
 
     def update(self, other: Series | Sequence | Mapping) -> None:
         """
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -327,13 +327,6 @@ def test_where_series(self, data, na_value):
         self.assert_series_equal(result, expected)
 
     def test_combine_first(self, data, request):
-        if data.dtype.subtype == "int":
-            # Right now this is upcasted to float, just like combine_first
-            # for Series[int]
-            mark = pytest.mark.xfail(
-                reason="TODO(SparseArray.__setitem__) will preserve dtype."
-            )
-            request.node.add_marker(mark)
         super().test_combine_first(data)
 
     def test_searchsorted(self, data_for_sorting, as_series):
diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py
@@ -112,3 +112,11 @@ def test_combine_first_timezone_series_with_empty_series(self):
         s2 = Series(index=time_index)
         result = s1.combine_first(s2)
         tm.assert_series_equal(result, s1)
+
+    def test_combine_first_preserves_dtype(self):
+        # GH51764: large int64 values must not lose precision via a float64 upcast
+        s1 = Series([1666880195890293744, 1666880195890293837])
+        s2 = Series([1, 2, 3])
+        result = s1.combine_first(s2)
+        expected = Series([1666880195890293744, 1666880195890293837, 3])
+        tm.assert_series_equal(result, expected)