Skip to content

Commit 65bca65

Browse files
authored
PERF: DataFrame.first_valid_index/last_valid_index for EA dtypes (#51549)
1 parent dff66e3 commit 65bca65

File tree

4 files changed

+37
-13
lines changed

4 files changed

+37
-13
lines changed

asv_bench/benchmarks/frame_methods.py

+25
Original file line number | Diff line number | Diff line change
@@ -754,4 +754,29 @@ def time_memory_usage_object_dtype(self):
754754
self.df2.memory_usage(deep=True)
755755

756756

757+
class FindValidIndex:
    """Benchmark ``DataFrame.first_valid_index`` and ``last_valid_index``.

    The benchmark is parametrized over the dtype used to build the frame:
    a NumPy float dtype, the masked ``Float64`` extension dtype and the
    pyarrow-backed ``float64[pyarrow]`` extension dtype.
    """

    param_names = ["dtype"]
    params = [
        ["float", "Float64", "float64[pyarrow]"],
    ]

    def setup(self, dtype):
        """Build a 100000 x 2 frame with missing values at both ends.

        Parameters
        ----------
        dtype : str
            Dtype passed to the ``DataFrame`` constructor.
        """
        df = DataFrame(
            np.random.randn(100000, 2),
            columns=list("AB"),
            dtype=dtype,
        )
        # Blank out a leading and a trailing run in each column (the runs in
        # column "B" are longer), so the first 100 and last 100 rows are
        # missing in both columns and neither lookup can stop at the edge.
        df.iloc[:100, 0] = None
        df.iloc[:200, 1] = None
        df.iloc[-100:, 0] = None
        df.iloc[-200:, 1] = None
        self.df = df

    def time_first_valid_index(self, dtype):
        """Time ``first_valid_index`` on the prepared frame."""
        self.df.first_valid_index()

    def time_last_valid_index(self, dtype):
        """Time ``last_valid_index`` on the prepared frame."""
        self.df.last_valid_index()
780+
781+
757782
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,7 @@ Deprecations
100100

101101
Performance improvements
102102
~~~~~~~~~~~~~~~~~~~~~~~~
103+
- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
103104
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
104105
-
105106

pandas/core/generic.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -11861,7 +11861,8 @@ def _find_valid_index(self, *, how: str) -> Hashable | None:
1186111861
-------
1186211862
idx_first_valid : type of index
1186311863
"""
11864-
idxpos = find_valid_index(self._values, how=how, is_valid=~isna(self._values))
11864+
is_valid = self.notna().values
11865+
idxpos = find_valid_index(how=how, is_valid=is_valid)
1186511866
if idxpos is None:
1186611867
return None
1186711868
return self.index[idxpos]

pandas/core/missing.py

+9-12
Original file line number | Diff line number | Diff line change
@@ -183,15 +183,12 @@ def clean_interp_method(method: str, index: Index, **kwargs) -> str:
183183
return method
184184

185185

186-
def find_valid_index(
187-
values, *, how: str, is_valid: npt.NDArray[np.bool_]
188-
) -> int | None:
186+
def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None:
189187
"""
190-
Retrieves the index of the first valid value.
188+
Retrieves the positional index of the first valid value.
191189
192190
Parameters
193191
----------
194-
values : ndarray or ExtensionArray
195192
how : {'first', 'last'}
196193
Use this parameter to change between the first or last valid index.
197194
is_valid: np.ndarray
@@ -203,17 +200,17 @@ def find_valid_index(
203200
"""
204201
assert how in ["first", "last"]
205202

206-
if len(values) == 0: # early stop
203+
if len(is_valid) == 0: # early stop
207204
return None
208205

209-
if values.ndim == 2:
206+
if is_valid.ndim == 2:
210207
is_valid = is_valid.any(axis=1) # reduce axis 1
211208

212209
if how == "first":
213210
idxpos = is_valid[::].argmax()
214211

215212
elif how == "last":
216-
idxpos = len(values) - 1 - is_valid[::-1].argmax()
213+
idxpos = len(is_valid) - 1 - is_valid[::-1].argmax()
217214

218215
chk_notna = is_valid[idxpos]
219216

@@ -417,12 +414,12 @@ def _interpolate_1d(
417414
# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
418415
all_nans = set(np.flatnonzero(invalid))
419416

420-
first_valid_index = find_valid_index(yvalues, how="first", is_valid=valid)
417+
first_valid_index = find_valid_index(how="first", is_valid=valid)
421418
if first_valid_index is None: # no nan found in start
422419
first_valid_index = 0
423420
start_nans = set(range(first_valid_index))
424421

425-
last_valid_index = find_valid_index(yvalues, how="last", is_valid=valid)
422+
last_valid_index = find_valid_index(how="last", is_valid=valid)
426423
if last_valid_index is None: # no nan found in end
427424
last_valid_index = len(yvalues)
428425
end_nans = set(range(1 + last_valid_index, len(valid)))
@@ -766,10 +763,10 @@ def _interpolate_with_limit_area(
766763
is_valid = ~invalid
767764

768765
if not invalid.all():
769-
first = find_valid_index(values, how="first", is_valid=is_valid)
766+
first = find_valid_index(how="first", is_valid=is_valid)
770767
if first is None:
771768
first = 0
772-
last = find_valid_index(values, how="last", is_valid=is_valid)
769+
last = find_valid_index(how="last", is_valid=is_valid)
773770
if last is None:
774771
last = len(values)
775772

0 commit comments

Comments
 (0)