class Fillna:
    """Benchmark ``Series.fillna`` across dtypes and fill strategies.

    Parametrized over ``dtype`` (the Series dtype) and ``method``.  When
    ``method`` is ``None`` the fill uses a scalar taken from the generated
    data; for ``"pad"`` / ``"backfill"`` no value is passed and only the
    method is used.
    """

    params = [
        [
            "datetime64[ns]",
            "float64",
            "Int64",
            "int64[pyarrow]",
            "string",
            "string[pyarrow]",
        ],
        [None, "pad", "backfill"],
    ]
    param_names = ["dtype", "method"]

    def setup(self, dtype, method):
        """Build a Series of 10**6 elements with every other element missing.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the handled dtypes, or if building the
            Series raises ``ImportError``.
        """
        N = 10**6
        if dtype == "datetime64[ns]":
            data = date_range("2000-01-01", freq="S", periods=N)
            na_value = NaT
        elif dtype == "float64":
            data = np.random.randn(N)
            na_value = np.nan
        elif dtype in ("Int64", "int64[pyarrow]"):
            data = np.arange(N)
            na_value = NA
        elif dtype in ("string", "string[pyarrow]"):
            data = tm.rands_array(5, N)
            na_value = NA
        else:
            raise NotImplementedError

        # Grab the scalar before any missing values are written into the
        # Series, so the fill value is always a valid element.
        fill_value = data[0]
        try:
            ser = Series(data, dtype=dtype)
        except ImportError as err:
            # An optional backend (presumably pyarrow for the "[pyarrow]"
            # dtypes) is not importable: report the combination as not
            # applicable, like the unsupported-dtype branch above, instead
            # of letting the benchmark error out.
            raise NotImplementedError from err
        ser[::2] = na_value
        self.ser = ser
        self.fill_value = fill_value

    def time_fillna(self, dtype, method):
        """Time one ``fillna`` call using either a scalar value or a method."""
        # value and method are mutually exclusive in this benchmark
        value = self.fill_value if method is None else None
        self.ser.fillna(value=value, method=method)
# NOTE: the user-facing docstring is supplied by ``@doc`` from
# ``ExtensionArray.fillna``; implementation notes are therefore kept as
# comments rather than a docstring.
#
# Strategy:
#   * ``limit`` given, or a fill ``method`` on pyarrow < 7.0 -> defer to the
#     parent implementation.
#   * otherwise use the pyarrow compute kernels (``fill_null`` /
#     ``fill_null_forward`` / ``fill_null_backward``).
#   * if pyarrow reports the kernel as not implemented for this type, fall
#     back to the parent implementation.
#
# Raises ValueError when an array-like ``value`` has a different length than
# ``self`` and TypeError when ``value`` cannot be converted to this array's
# pyarrow type.
@doc(ExtensionArray.fillna)
def fillna(
    self: ArrowExtensionArrayT,
    value: object | ArrayLike | None = None,
    method: FillnaOptions | None = None,
    limit: int | None = None,
) -> ArrowExtensionArrayT:

    value, method = validate_fillna_kwargs(value, method)

    if limit is not None:
        # the pyarrow kernels used below take no limit argument here
        return super().fillna(value=value, method=method, limit=limit)

    if method is not None and pa_version_under7p0:
        # fill_null_{forward|backward} added in pyarrow 7.0
        fallback_performancewarning(version="7")
        return super().fillna(value=value, method=method, limit=limit)

    if is_array_like(value):
        value = cast(ArrayLike, value)
        if len(value) != len(self):
            # The two fragments previously each contributed a space,
            # yielding "Got (n)  expected m".
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f"expected {len(self)}"
            )

    def convert_fill_value(value, pa_type, dtype):
        # Box ``value`` as a pyarrow scalar/array of ``pa_type``; ``None`` and
        # objects that are already pyarrow containers pass through untouched.
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        pa_box = pa.array if is_array_like(value) else pa.scalar
        try:
            return pa_box(value, type=pa_type, from_pandas=True)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{value!s}' for dtype {dtype}"
            raise TypeError(msg) from err

    fill_value = convert_fill_value(value, self._data.type, self.dtype)

    try:
        if method is None:
            return type(self)(pc.fill_null(self._data, fill_value=fill_value))
        elif method == "pad":
            return type(self)(pc.fill_null_forward(self._data))
        elif method == "backfill":
            return type(self)(pc.fill_null_backward(self._data))
    except pa.ArrowNotImplementedError:
        # ArrowNotImplementedError: Function 'coalesce' has no kernel
        # matching input types (duration[ns], duration[ns])
        # TODO: remove try/except wrapper if/when pyarrow implements
        #   a kernel for duration types.
        pass

    # Reached when the kernel is unavailable for this type or ``method`` is
    # not one of the values handled above.
    return super().fillna(value=value, method=method, limit=limit)
if not len(values): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e321e8da15a6e..8d8d9ce20cefd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -408,14 +408,6 @@ def test_min_max_numpy(method, box, dtype, request): def test_fillna_args(dtype, request): # GH 37987 - if dtype.storage == "pyarrow": - reason = ( - "Regex pattern \"Cannot set non-string value '1' into " - "a StringArray.\" does not match 'Scalar must be NA or str'" - ) - mark = pytest.mark.xfail(raises=AssertionError, reason=reason) - request.node.add_marker(mark) - arr = pd.array(["a", pd.NA], dtype=dtype) res = arr.fillna(value="b") @@ -426,8 +418,13 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - msg = "Cannot set non-string value '1' into a StringArray." - with pytest.raises(ValueError, match=msg): + if dtype.storage == "pyarrow": + err = TypeError + msg = "Invalid value '1' for dtype string" + else: + err = ValueError + msg = "Cannot set non-string value '1' into a StringArray." 
# --- pandas/tests/extension/test_arrow.py :: TestBaseMissing overrides ---


def test_fillna_no_op_returns_copy(self, data):
    # Run the inherited test unchanged, tolerating a PerformanceWarning only
    # when pa_version_under7p0 is true.
    warning_expected = pa_version_under7p0
    with tm.maybe_produces_warning(
        PerformanceWarning, warning_expected, check_stacklevel=False
    ):
        super().test_fillna_no_op_returns_copy(data)


def test_fillna_series_method(self, data_missing, fillna_method):
    # Same warning allowance as above, applied to the method-based fill test.
    warning_expected = pa_version_under7p0
    with tm.maybe_produces_warning(
        PerformanceWarning, warning_expected, check_stacklevel=False
    ):
        super().test_fillna_series_method(data_missing, fillna_method)


# --- pandas/tests/extension/test_string.py :: missing-data test overrides ---


def test_fillna_no_op_returns_copy(self, data):
    # The warning is only allowed for pyarrow-backed storage on
    # pa_version_under7p0; other storage must stay warning-free.
    warning_expected = pa_version_under7p0 and data.dtype.storage == "pyarrow"
    with tm.maybe_produces_warning(
        PerformanceWarning, warning_expected, check_stacklevel=False
    ):
        super().test_fillna_no_op_returns_copy(data)


def test_fillna_series_method(self, data_missing, fillna_method):
    # Same storage-dependent allowance, keyed on the fixture with missing
    # values.
    warning_expected = (
        pa_version_under7p0 and data_missing.dtype.storage == "pyarrow"
    )
    with tm.maybe_produces_warning(
        PerformanceWarning, warning_expected, check_stacklevel=False
    ):
        super().test_fillna_series_method(data_missing, fillna_method)