Skip to content

PERF: Series.fillna for pyarrow-backed dtypes #49722

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 22, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,48 @@ def time_dropna(self, dtype):
self.s.dropna()


class Fillna:
    """Benchmark ``Series.fillna`` over several dtypes and fill methods."""

    param_names = ["dtype", "method"]
    params = [
        [
            "datetime64[ns]",
            "float64",
            "Int64",
            "int64[pyarrow]",
            "string",
            "string[pyarrow]",
        ],
        [None, "pad", "backfill"],
    ]

    def setup(self, dtype, method):
        """Build a Series of ``dtype`` in which every other element is missing.

        Stores the Series on ``self.ser`` and the first (pre-masking) element
        of the raw data on ``self.fill_value``.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the dtypes handled below.
        """
        size = 10**6

        # Pick the raw data and the matching missing-value marker per dtype.
        if dtype in ("string", "string[pyarrow]"):
            values = tm.rands_array(5, size)
            missing = NA
        elif dtype in ("Int64", "int64[pyarrow]"):
            values = np.arange(size)
            missing = NA
        elif dtype == "float64":
            values = np.random.randn(size)
            missing = np.nan
        elif dtype == "datetime64[ns]":
            values = date_range("2000-01-01", freq="S", periods=size)
            missing = NaT
        else:
            raise NotImplementedError

        # Capture the fill value before any entries are blanked out.
        first_value = values[0]
        series = Series(values, dtype=dtype)
        # Positions 0, 2, 4, ... become missing.
        series[::2] = missing

        self.fill_value = first_value
        self.ser = series

    def time_fillna(self, dtype, method):
        """Time ``fillna`` with a scalar value when no method is given,
        otherwise with the method alone."""
        if method is None:
            fill = self.fill_value
        else:
            fill = None
        self.ser.fillna(value=fill, method=method)


class SearchSorted:

goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,7 @@ Performance improvements
- Performance improvement in :meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`)
- Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`, :issue:`49577`)
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
- Performance improvement in :meth:`Series.fillna` for pyarrow-backed dtypes (:issue:`49722`)
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
- Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)
Expand Down
55 changes: 55 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
TYPE_CHECKING,
Any,
TypeVar,
cast,
)

import numpy as np

from pandas._typing import (
ArrayLike,
Dtype,
FillnaOptions,
PositionalIndexer,
SortKind,
TakeIndexer,
Expand All @@ -20,6 +23,7 @@
pa_version_under7p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
is_array_like,
Expand Down Expand Up @@ -521,6 +525,57 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
else:
return type(self)(pc.drop_null(self._data))

@doc(ExtensionArray.fillna)
def fillna(
    self: ArrowExtensionArrayT,
    value: object | ArrayLike | None = None,
    method: FillnaOptions | None = None,
    limit: int | None = None,
) -> ArrowExtensionArrayT:
    # NOTE: no docstring here on purpose; the ``@doc`` decorator supplies it
    # from ``ExtensionArray.fillna``. The comments below describe the
    # pyarrow-specific fast paths and when the generic implementation is used.

    value, method = validate_fillna_kwargs(value, method)

    # An array-like fill value must line up element-for-element with self.
    if is_array_like(value):
        value = cast(ArrayLike, value)
        if len(value) != len(self):
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f" expected {len(self)}"
            )

    def convert_fill_value(value, pa_type, dtype):
        # Coerce ``value`` to a pyarrow scalar/array of ``pa_type``.
        # ``None`` and objects that are already pyarrow containers pass
        # through untouched. Raises TypeError when pyarrow rejects the value.
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        func = pa.array if is_array_like(value) else pa.scalar
        try:
            value = func(value, type=pa_type, from_pandas=True)
        except pa.ArrowTypeError as err:
            # Chain explicitly so the pyarrow error is shown as the cause
            # rather than as an unrelated exception raised during handling.
            raise TypeError(
                f"Invalid value '{str(value)}' for dtype {dtype}"
            ) from err
        return value

    fill_value = convert_fill_value(value, self._data.type, self.dtype)

    # Fast paths: delegate to pyarrow compute kernels when no ``limit`` is
    # requested. The forward/backward kernels are only used when the
    # ``pa_version_under7p0`` flag is false.
    try:
        if method is None and limit is None:
            return type(self)(pc.fill_null(self._data, fill_value=fill_value))
        elif method == "pad" and limit is None and not pa_version_under7p0:
            return type(self)(pc.fill_null_forward(self._data))
        elif method == "backfill" and limit is None and not pa_version_under7p0:
            return type(self)(pc.fill_null_backward(self._data))
    except pa.ArrowNotImplementedError:
        # ArrowNotImplementedError: Function 'coalesce' has no kernel
        # matching input types (duration[ns], duration[ns])
        # TODO: remove this except case if/when pyarrow implements a
        # kernel for duration types.
        pass

    # Fallback: generic ExtensionArray implementation (handles ``limit``,
    # older pyarrow, and types without a pyarrow kernel).
    return super().fillna(value=value, method=method, limit=limit)

def isin(self, values) -> npt.NDArray[np.bool_]:
# short-circuit to return all False array.
if not len(values):
Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,14 +408,6 @@ def test_min_max_numpy(method, box, dtype, request):
def test_fillna_args(dtype, request):
# GH 37987

if dtype.storage == "pyarrow":
reason = (
"Regex pattern \"Cannot set non-string value '1' into "
"a StringArray.\" does not match 'Scalar must be NA or str'"
)
mark = pytest.mark.xfail(raises=AssertionError, reason=reason)
request.node.add_marker(mark)

arr = pd.array(["a", pd.NA], dtype=dtype)

res = arr.fillna(value="b")
Expand All @@ -426,8 +418,13 @@ def test_fillna_args(dtype, request):
expected = pd.array(["a", "b"], dtype=dtype)
tm.assert_extension_array_equal(res, expected)

msg = "Cannot set non-string value '1' into a StringArray."
with pytest.raises(ValueError, match=msg):
if dtype.storage == "pyarrow":
err = TypeError
msg = "Invalid value '1' for dtype string"
else:
err = ValueError
msg = "Cannot set non-string value '1' into a StringArray."
with pytest.raises(err, match=msg):
arr.fillna(value=1)


Expand Down