diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 47a61d41b2440..e9c091b14810b 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -115,6 +115,7 @@ Performance improvements
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`)
 - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
 - Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
+- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
@@ -203,6 +204,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
+- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
 -
 
 Sparse
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 25cb18b0135fa..38a5d94db207c 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -3191,13 +3191,23 @@ def combine_first(self, other) -> Series:
         falcon      NaN
         dtype: float64
         """
+        from pandas.core.reshape.concat import concat
+
         new_index = self.index.union(other.index)
-        this = self.reindex(new_index, copy=False)
-        other = other.reindex(new_index, copy=False)
+
+        this = self
+        # identify the index subset to keep for each series
+        keep_other = other.index.difference(this.index[notna(this)])
+        keep_this = this.index.difference(keep_other)
+
+        this = this.reindex(keep_this, copy=False)
+        other = other.reindex(keep_other, copy=False)
+
         if this.dtype.kind == "M" and other.dtype.kind != "M":
             other = to_datetime(other)
-
-        return this.where(notna(this), other)
+        combined = concat([this, other])
+        combined = combined.reindex(new_index, copy=False)
+        return combined.__finalize__(self, method="combine_first")
 
     def update(self, other: Series | Sequence | Mapping) -> None:
         """
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index 2aeb2af567ea0..836e644affbda 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -327,13 +327,6 @@ def test_where_series(self, data, na_value):
         self.assert_series_equal(result, expected)
 
     def test_combine_first(self, data, request):
-        if data.dtype.subtype == "int":
-            # Right now this is upcasted to float, just like combine_first
-            # for Series[int]
-            mark = pytest.mark.xfail(
-                reason="TODO(SparseArray.__setitem__) will preserve dtype."
-            )
-            request.node.add_marker(mark)
         super().test_combine_first(data)
 
     def test_searchsorted(self, data_for_sorting, as_series):
diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py
index aac3d4986e8ee..25a3dfe42e843 100644
--- a/pandas/tests/series/methods/test_combine_first.py
+++ b/pandas/tests/series/methods/test_combine_first.py
@@ -112,3 +112,11 @@ def test_combine_first_timezone_series_with_empty_series(self):
         s2 = Series(index=time_index)
         result = s1.combine_first(s2)
         tm.assert_series_equal(result, s1)
+
+    def test_combine_first_preserves_dtype(self):
+        # GH51764
+        s1 = Series([1666880195890293744, 1666880195890293837])
+        s2 = Series([1, 2, 3])
+        result = s1.combine_first(s2)
+        expected = Series([1666880195890293744, 1666880195890293837, 3])
+        tm.assert_series_equal(result, expected)