Skip to content

Commit 3888a3f

Browse files
PERF: avoid object conversion in fillna(method=pad|backfill) for masked arrays (#39953)
1 parent d17476d commit 3888a3f

File tree

14 files changed

+139
-31
lines changed

14 files changed

+139
-31
lines changed

asv_bench/benchmarks/frame_methods.py

+37-9
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
date_range,
1212
isnull,
1313
period_range,
14+
timedelta_range,
1415
)
1516

1617
from .pandas_vb_common import tm
@@ -355,15 +356,42 @@ def time_isnull_obj(self):
355356

356357
class Fillna:
357358

358-
params = ([True, False], ["pad", "bfill"])
359-
param_names = ["inplace", "method"]
360-
361-
def setup(self, inplace, method):
362-
values = np.random.randn(10000, 100)
363-
values[::2] = np.nan
364-
self.df = DataFrame(values)
365-
366-
def time_frame_fillna(self, inplace, method):
359+
params = (
360+
[True, False],
361+
["pad", "bfill"],
362+
[
363+
"float64",
364+
"float32",
365+
"object",
366+
"Int64",
367+
"Float64",
368+
"datetime64[ns]",
369+
"datetime64[ns, tz]",
370+
"timedelta64[ns]",
371+
],
372+
)
373+
param_names = ["inplace", "method", "dtype"]
374+
375+
def setup(self, inplace, method, dtype):
376+
N, M = 10000, 100
377+
if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
378+
data = {
379+
"datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
380+
"datetime64[ns, tz]": date_range(
381+
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
382+
),
383+
"timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
384+
}
385+
self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)})
386+
self.df[::2] = None
387+
else:
388+
values = np.random.randn(N, M)
389+
values[::2] = np.nan
390+
if dtype == "Int64":
391+
values = values.round()
392+
self.df = DataFrame(values, dtype=dtype)
393+
394+
def time_frame_fillna(self, inplace, method, dtype):
367395
self.df.fillna(inplace=inplace, method=method)
368396

369397

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ Performance improvements
375375
- Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`)
376376
- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
377377
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
378+
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad"`` or ``method="backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
378379
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
379380
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
380381
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.rolling.RollingGroupby.cov` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)

pandas/_libs/algos.pyx

+10-2
Original file line numberDiff line numberDiff line change
@@ -597,10 +597,11 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None):
597597

598598
@cython.boundscheck(False)
599599
@cython.wraparound(False)
600-
def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
600+
def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
601601
cdef:
602602
Py_ssize_t i, N
603603
algos_t val
604+
uint8_t prev_mask
604605
int lim, fill_count = 0
605606

606607
N = len(values)
@@ -612,15 +613,18 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
612613
lim = validate_limit(N, limit)
613614

614615
val = values[0]
616+
prev_mask = mask[0]
615617
for i in range(N):
616618
if mask[i]:
617619
if fill_count >= lim:
618620
continue
619621
fill_count += 1
620622
values[i] = val
623+
mask[i] = prev_mask
621624
else:
622625
fill_count = 0
623626
val = values[i]
627+
prev_mask = mask[i]
624628

625629

626630
@cython.boundscheck(False)
@@ -739,10 +743,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray:
739743

740744
@cython.boundscheck(False)
741745
@cython.wraparound(False)
742-
def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
746+
def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None):
743747
cdef:
744748
Py_ssize_t i, N
745749
algos_t val
750+
uint8_t prev_mask
746751
int lim, fill_count = 0
747752

748753
N = len(values)
@@ -754,15 +759,18 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None):
754759
lim = validate_limit(N, limit)
755760

756761
val = values[N - 1]
762+
prev_mask = mask[N - 1]
757763
for i in range(N - 1, -1, -1):
758764
if mask[i]:
759765
if fill_count >= lim:
760766
continue
761767
fill_count += 1
762768
values[i] = val
769+
mask[i] = prev_mask
763770
else:
764771
fill_count = 0
765772
val = values[i]
773+
prev_mask = mask[i]
766774

767775

768776
@cython.boundscheck(False)

pandas/core/arrays/_mixins.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ def fillna(
279279
if mask.any():
280280
if method is not None:
281281
func = missing.get_fill_func(method)
282-
new_values = func(self._ndarray.copy(), limit=limit, mask=mask)
282+
new_values, _ = func(self._ndarray.copy(), limit=limit, mask=mask)
283283
# TODO: PandasArray didn't use to copy; need tests for this
284284
new_values = self._from_backing_data(new_values)
285285
else:

pandas/core/arrays/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,7 @@ def fillna(self, value=None, method=None, limit=None):
702702
if mask.any():
703703
if method is not None:
704704
func = missing.get_fill_func(method)
705-
new_values = func(self.astype(object), limit=limit, mask=mask)
705+
new_values, _ = func(self.astype(object), limit=limit, mask=mask)
706706
new_values = self._from_sequence(new_values, dtype=self.dtype)
707707
else:
708708
# fill with value

pandas/core/arrays/masked.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
cache_readonly,
2929
doc,
3030
)
31+
from pandas.util._validators import validate_fillna_kwargs
3132

3233
from pandas.core.dtypes.base import ExtensionDtype
3334
from pandas.core.dtypes.common import (
@@ -38,12 +39,16 @@
3839
is_string_dtype,
3940
pandas_dtype,
4041
)
42+
from pandas.core.dtypes.inference import is_array_like
4143
from pandas.core.dtypes.missing import (
4244
isna,
4345
notna,
4446
)
4547

46-
from pandas.core import nanops
48+
from pandas.core import (
49+
missing,
50+
nanops,
51+
)
4752
from pandas.core.algorithms import (
4853
factorize_array,
4954
isin,
@@ -144,6 +149,39 @@ def __getitem__(
144149

145150
return type(self)(self._data[item], self._mask[item])
146151

152+
@doc(ExtensionArray.fillna)
153+
def fillna(
154+
self: BaseMaskedArrayT, value=None, method=None, limit=None
155+
) -> BaseMaskedArrayT:
156+
value, method = validate_fillna_kwargs(value, method)
157+
158+
mask = self._mask
159+
160+
if is_array_like(value):
161+
if len(value) != len(self):
162+
raise ValueError(
163+
f"Length of 'value' does not match. Got ({len(value)}) "
164+
f" expected {len(self)}"
165+
)
166+
value = value[mask]
167+
168+
if mask.any():
169+
if method is not None:
170+
func = missing.get_fill_func(method)
171+
new_values, new_mask = func(
172+
self._data.copy(),
173+
limit=limit,
174+
mask=mask.copy(),
175+
)
176+
return type(self)(new_values, new_mask.view(np.bool_))
177+
else:
178+
# fill with value
179+
new_values = self.copy()
180+
new_values[mask] = value
181+
else:
182+
new_values = self.copy()
183+
return new_values
184+
147185
def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
148186
raise AbstractMethodError(self)
149187

pandas/core/arrays/string_arrow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ def fillna(self, value=None, method=None, limit=None):
400400
if mask.any():
401401
if method is not None:
402402
func = missing.get_fill_func(method)
403-
new_values = func(self.to_numpy(object), limit=limit, mask=mask)
403+
new_values, _ = func(self.to_numpy(object), limit=limit, mask=mask)
404404
new_values = self._from_sequence(new_values)
405405
else:
406406
# fill with value

pandas/core/internals/blocks.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -1727,16 +1727,13 @@ def _slice(self, slicer):
17271727
def fillna(
17281728
self, value, limit=None, inplace: bool = False, downcast=None
17291729
) -> List[Block]:
1730-
values = self.values if inplace else self.values.copy()
1731-
values = values.fillna(value=value, limit=limit)
1730+
values = self.values.fillna(value=value, limit=limit)
17321731
return [self.make_block_same_class(values=values)]
17331732

17341733
def interpolate(
17351734
self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
17361735
):
1737-
1738-
values = self.values if inplace else self.values.copy()
1739-
new_values = values.fillna(value=fill_value, method=method, limit=limit)
1736+
new_values = self.values.fillna(value=fill_value, method=method, limit=limit)
17401737
return self.make_block_same_class(new_values)
17411738

17421739
def diff(self, n: int, axis: int = 1) -> List[Block]:

pandas/core/missing.py

+18-10
Original file line numberDiff line numberDiff line change
@@ -660,9 +660,9 @@ def interpolate_2d(
660660
method = clean_fill_method(method)
661661
tvalues = transf(values)
662662
if method == "pad":
663-
result = _pad_2d(tvalues, limit=limit)
663+
result, _ = _pad_2d(tvalues, limit=limit)
664664
else:
665-
result = _backfill_2d(tvalues, limit=limit)
665+
result, _ = _backfill_2d(tvalues, limit=limit)
666666

667667
result = transf(result)
668668
# reshape back
@@ -698,26 +698,34 @@ def new_func(values, limit=None, mask=None):
698698
# This needs to occur before casting to int64
699699
mask = isna(values)
700700

701-
result = func(values.view("i8"), limit=limit, mask=mask)
702-
return result.view(values.dtype)
701+
result, mask = func(values.view("i8"), limit=limit, mask=mask)
702+
return result.view(values.dtype), mask
703703

704704
return func(values, limit=limit, mask=mask)
705705

706706
return cast(F, new_func)
707707

708708

709709
@_datetimelike_compat
710-
def _pad_1d(values, limit=None, mask=None):
710+
def _pad_1d(
711+
values: np.ndarray,
712+
limit: int | None = None,
713+
mask: np.ndarray | None = None,
714+
) -> tuple[np.ndarray, np.ndarray]:
711715
mask = _fillna_prep(values, mask)
712716
algos.pad_inplace(values, mask, limit=limit)
713-
return values
717+
return values, mask
714718

715719

716720
@_datetimelike_compat
717-
def _backfill_1d(values, limit=None, mask=None):
721+
def _backfill_1d(
722+
values: np.ndarray,
723+
limit: int | None = None,
724+
mask: np.ndarray | None = None,
725+
) -> tuple[np.ndarray, np.ndarray]:
718726
mask = _fillna_prep(values, mask)
719727
algos.backfill_inplace(values, mask, limit=limit)
720-
return values
728+
return values, mask
721729

722730

723731
@_datetimelike_compat
@@ -729,7 +737,7 @@ def _pad_2d(values, limit=None, mask=None):
729737
else:
730738
# for test coverage
731739
pass
732-
return values
740+
return values, mask
733741

734742

735743
@_datetimelike_compat
@@ -741,7 +749,7 @@ def _backfill_2d(values, limit=None, mask=None):
741749
else:
742750
# for test coverage
743751
pass
744-
return values
752+
return values, mask
745753

746754

747755
_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4529,7 +4529,7 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit):
45294529
fill_f = missing.get_fill_func(method)
45304530

45314531
mask = missing.mask_missing(result.values, to_replace)
4532-
values = fill_f(result.values, limit=limit, mask=mask)
4532+
values, _ = fill_f(result.values, limit=limit, mask=mask)
45334533

45344534
if values.dtype == orig_dtype and inplace:
45354535
return

pandas/tests/extension/base/missing.py

+12
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ def test_fillna_limit_backfill(self, data_missing):
6969
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
7070
self.assert_series_equal(result, expected)
7171

72+
def test_fillna_no_op_returns_copy(self, data):
73+
data = data[~data.isna()]
74+
75+
valid = data[0]
76+
result = data.fillna(valid)
77+
assert result is not data
78+
self.assert_extension_array_equal(result, data)
79+
80+
result = data.fillna(method="backfill")
81+
assert result is not data
82+
self.assert_extension_array_equal(result, data)
83+
7284
def test_fillna_series(self, data_missing):
7385
fill_value = data_missing[1]
7486
ser = pd.Series(data_missing)

pandas/tests/extension/test_interval.py

+4
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ def test_fillna_series_method(self):
132132
def test_fillna_limit_backfill(self):
133133
pass
134134

135+
@unsupported_fill
136+
def test_fillna_no_op_returns_copy(self):
137+
pass
138+
135139
@unsupported_fill
136140
def test_fillna_series(self):
137141
pass

pandas/tests/extension/test_numpy.py

+5
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ def test_fillna_scalar(self, data_missing):
309309
# Non-scalar "scalar" values.
310310
super().test_fillna_scalar(data_missing)
311311

312+
@skip_nested
313+
def test_fillna_no_op_returns_copy(self, data):
314+
# Non-scalar "scalar" values.
315+
super().test_fillna_no_op_returns_copy(data)
316+
312317
@skip_nested
313318
def test_fillna_series(self, data_missing):
314319
# Non-scalar "scalar" values.

pandas/tests/extension/test_sparse.py

+7
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,13 @@ def test_fillna_limit_backfill(self, data_missing):
221221
with tm.assert_produces_warning(PerformanceWarning):
222222
super().test_fillna_limit_backfill(data_missing)
223223

224+
def test_fillna_no_op_returns_copy(self, data, request):
225+
if np.isnan(data.fill_value):
226+
request.node.add_marker(
227+
pytest.mark.xfail(reason="returns array with different fill value")
228+
)
229+
super().test_fillna_no_op_returns_copy(data)
230+
224231
def test_fillna_series_method(self, data_missing):
225232
with tm.assert_produces_warning(PerformanceWarning):
226233
super().test_fillna_limit_backfill(data_missing)

0 commit comments

Comments
 (0)