Skip to content

Commit 8e70ba3

Browse files
jbrockmendel, im-vinicius
authored and
im-vinicius
committed
PERF: ffill/bfill with non-numpy dtypes (pandas-dev#53950)
1 parent 24ff1ce commit 8e70ba3

File tree

7 files changed

+55
-26
lines changed

7 files changed

+55
-26
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ Performance improvements
342342
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
343343
- Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`)
344344
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
345+
- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`)
345346
- Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`)
346347
- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
347348
- Performance improvement in :meth:`Series.str.split` with ``expand=True`` for pyarrow-backed strings (:issue:`53585`)

pandas/_libs/algos.pyi

+4
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ def nancorr_spearman(
6060
# ----------------------------------------------------------------------
6161

6262
def validate_limit(nobs: int | None, limit=...) -> int: ...
63+
def get_fill_indexer(
64+
mask: npt.NDArray[np.bool_],
65+
limit: int | None = None,
66+
) -> npt.NDArray[np.intp]: ...
6367
def pad(
6468
old: np.ndarray, # ndarray[numeric_object_t]
6569
new: np.ndarray, # ndarray[numeric_object_t]

pandas/_libs/algos.pyx

+36
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,42 @@ def validate_limit(nobs: int | None, limit=None) -> int:
525525
return lim
526526

527527

528+
# TODO: overlap with libgroupby.group_fillna_indexer?
529+
@cython.boundscheck(False)
530+
@cython.wraparound(False)
531+
def get_fill_indexer(const uint8_t[:] mask, limit=None):
532+
"""
533+
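Find the indexer to pass to `take` in order to forward-fill the array: each non-NA position maps to itself, and each NA position maps to the last non-NA position before it, or to -1 if there is none or the run of NAs has already used up `limit`.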
534+
"""
535+
cdef:
536+
ndarray[intp_t, ndim=1] indexer
537+
Py_ssize_t i, N = len(mask), last_valid
538+
int lim
539+
540+
# fill_count is the number of consecutive NAs we have seen.
541+
# Once it reaches the given limit, we stop padding.
542+
int fill_count = 0
543+
544+
lim = validate_limit(N, limit)
545+
indexer = np.empty(N, dtype=np.intp)
546+
547+
last_valid = -1 # haven't yet seen anything non-NA
548+
549+
for i in range(N):
550+
if not mask[i]:
551+
indexer[i] = i
552+
last_valid = i
553+
fill_count = 0
554+
else:
555+
if fill_count < lim:
556+
indexer[i] = last_valid
557+
else:
558+
indexer[i] = -1
559+
fill_count += 1
560+
561+
return indexer
562+
563+
528564
@cython.boundscheck(False)
529565
@cython.wraparound(False)
530566
def pad(

pandas/core/arrays/arrow/array.py

-3
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,6 @@
6767

6868
from pandas.core.dtypes.dtypes import ArrowDtype
6969

70-
from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
71-
7270
ARROW_CMP_FUNCS = {
7371
"eq": pc.equal,
7472
"ne": pc.not_equal,
@@ -918,7 +916,6 @@ def fillna(
918916
return super().fillna(value=value, method=method, limit=limit)
919917

920918
if method is not None:
921-
fallback_performancewarning()
922919
return super().fillna(value=value, method=method, limit=limit)
923920

924921
if isinstance(value, (np.ndarray, ExtensionArray)):

pandas/core/arrays/base.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323

2424
import numpy as np
2525

26-
from pandas._libs import lib
26+
from pandas._libs import (
27+
algos as libalgos,
28+
lib,
29+
)
2730
from pandas.compat import set_function_name
2831
from pandas.compat.numpy import function as nv
2932
from pandas.errors import AbstractMethodError
@@ -824,10 +827,16 @@ def fillna(
824827

825828
if mask.any():
826829
if method is not None:
827-
func = missing.get_fill_func(method)
828-
npvalues = self.astype(object)
829-
func(npvalues, limit=limit, mask=mask)
830-
new_values = self._from_sequence(npvalues, dtype=self.dtype)
830+
meth = missing.clean_fill_method(method)
831+
832+
npmask = np.asarray(mask)
833+
if meth == "pad":
834+
indexer = libalgos.get_fill_indexer(npmask, limit=limit)
835+
return self.take(indexer, allow_fill=True)
836+
else:
837+
# i.e. meth == "backfill"
838+
indexer = libalgos.get_fill_indexer(npmask[::-1], limit=limit)[::-1]
839+
return self[::-1].take(indexer, allow_fill=True)
831840
else:
832841
# fill with value
833842
new_values = self.copy()

pandas/tests/extension/test_arrow.py

-7
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
pa_version_under9p0,
3939
pa_version_under11p0,
4040
)
41-
from pandas.errors import PerformanceWarning
4241

4342
from pandas.core.dtypes.dtypes import (
4443
ArrowDtype,
@@ -698,12 +697,6 @@ def test_fillna_no_op_returns_copy(self, data):
698697
assert result is not data
699698
self.assert_extension_array_equal(result, data)
700699

701-
def test_fillna_series_method(self, data_missing, fillna_method):
702-
with tm.maybe_produces_warning(
703-
PerformanceWarning, fillna_method is not None, check_stacklevel=False
704-
):
705-
super().test_fillna_series_method(data_missing, fillna_method)
706-
707700

708701
class TestBasePrinting(base.BasePrintingTests):
709702
pass

pandas/tests/extension/test_string.py

-11
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@
1818
import numpy as np
1919
import pytest
2020

21-
from pandas.errors import PerformanceWarning
22-
2321
import pandas as pd
24-
import pandas._testing as tm
2522
from pandas.api.types import is_string_dtype
2623
from pandas.core.arrays import ArrowStringArray
2724
from pandas.core.arrays.string_ import StringDtype
@@ -169,14 +166,6 @@ def test_fillna_no_op_returns_copy(self, data):
169166
assert result is not data
170167
self.assert_extension_array_equal(result, data)
171168

172-
def test_fillna_series_method(self, data_missing, fillna_method):
173-
with tm.maybe_produces_warning(
174-
PerformanceWarning,
175-
fillna_method is not None and data_missing.dtype.storage == "pyarrow",
176-
check_stacklevel=False,
177-
):
178-
super().test_fillna_series_method(data_missing, fillna_method)
179-
180169

181170
class TestNoReduce(base.BaseNoReduceTests):
182171
@pytest.mark.parametrize("skipna", [True, False])

0 commit comments

Comments
 (0)