Skip to content

PERF: interpolate_1d returns function to apply columnwise #34728

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,22 +1194,16 @@ def _interpolate(
)
# process 1-d slices in the axis direction

def func(yvalues: np.ndarray) -> np.ndarray:

# process a 1-d slice, returning it
# should the axis argument be handled below in apply_along_axis?
# i.e. not an arg to missing.interpolate_1d
return missing.interpolate_1d(
xvalues=index,
yvalues=yvalues,
method=method,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
bounds_error=False,
**kwargs,
)
func = missing.interpolate_1d(
xvalues=index,
method=method,
limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
bounds_error=False,
**kwargs,
)

# interp each column independently
interp_values = np.apply_along_axis(func, axis, data)
Expand Down
150 changes: 76 additions & 74 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Routines for filling missing data.
"""

from typing import Any, List, Optional, Set, Union
from typing import Any, Callable, List, Optional, Set, Union

import numpy as np

Expand Down Expand Up @@ -170,7 +170,6 @@ def find_valid_index(values, how: str):

def interpolate_1d(
xvalues: np.ndarray,
yvalues: np.ndarray,
method: Optional[str] = "linear",
limit: Optional[int] = None,
limit_direction: str = "forward",
Expand All @@ -179,27 +178,14 @@ def interpolate_1d(
bounds_error: bool = False,
order: Optional[int] = None,
**kwargs,
):
) -> Callable[[np.ndarray], np.ndarray]:
"""
Logic for the 1-d interpolation. The result should be 1-d, inputs
xvalues and yvalues will each be 1-d arrays of the same length.

Bounds_error is currently hardcoded to False since non-scipy ones don't
take it as an argument.
"""
invalid = isna(yvalues)
valid = ~invalid

if not valid.any():
# have to call np.asarray(xvalues) since xvalues could be an Index
# which can't be mutated
result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
result.fill(np.nan)
return result

if valid.all():
return yvalues

if method == "time":
if not getattr(xvalues, "is_all_dates", None):
# if not issubclass(xvalues.dtype.type, np.datetime64):
Expand Down Expand Up @@ -230,45 +216,6 @@ def interpolate_1d(
# default limit is unlimited GH #16282
limit = algos._validate_limit(nobs=None, limit=limit)

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
all_nans = set(np.flatnonzero(invalid))
start_nans = set(range(find_valid_index(yvalues, "first")))
end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,
# but in this case, it is the final set of indices that need to be
# preserved as NaN after the interpolation.

# For example if limit_direction='forward' then preserve_nans will
# contain indices of NaNs at the beginning of the series, and NaNs that
# are more than'limit' away from the prior non-NaN.

# set preserve_nans based on direction using _interp_limit
preserve_nans: Union[List, Set]
if limit_direction == "forward":
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == "backward":
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))

# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
if limit_area == "inside":
# preserve NaNs on the outside
preserve_nans |= start_nans | end_nans
elif limit_area == "outside":
# preserve NaNs on the inside
preserve_nans |= mid_nans

# sort preserve_nans and covert to list
preserve_nans = sorted(preserve_nans)

yvalues = getattr(yvalues, "values", yvalues)
result = yvalues.copy()

# xvalues to pass to NumPy/SciPy

xvalues = getattr(xvalues, "values", xvalues)
Expand All @@ -285,26 +232,81 @@ def interpolate_1d(
if inds.dtype == np.object_:
inds = lib.maybe_convert_objects(inds)

if method in NP_METHODS:
# np.interp requires sorted X values, #21037
indexer = np.argsort(inds[valid])
result[invalid] = np.interp(
inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
)
else:
result[invalid] = _interpolate_scipy_wrapper(
inds[valid],
yvalues[valid],
inds[invalid],
method=method,
fill_value=fill_value,
bounds_error=bounds_error,
order=order,
**kwargs,
)
def func(yvalues: np.ndarray) -> np.ndarray:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make this a module level function and give it a nice name

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc-string as well

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it needs to be a closure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

really? can you pass any needed arguments, this makes it really hard to grok

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the code in this function is applied columnwise, the perf gains will come from doing some things once instead of 100x for the asv.

I prefer this functional approach to pre-validation as it keeps related code together. There is some more cleaning which could move more into the outer function once it's called less often

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand, but having this an outer function with passing args should not have any impact on performance. It simply a more understandable approach. ok with merging then refactoring to be simpler later though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

its basically the processing of the index which only needs to be done once. However, with the bulk of the time on the preserve_nans set logic, there is no sig perf gains yet, hence draft for now.

originally grouped the validation see #34628. So can do it that way if preferred.

i.e. method, xvalues = missing.clean_interp_method(method, index, **kwargs)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk sure, i think you have 1 PR pending that we should merge before this i think. but lmk what you prefer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, trying not to affect the work by @cchwala, I have an alternative implementation of limit_direction that does not re-use the preserve_nans set logic, see #34628

so i'm now happy #34727 need not affect that, but still need to look at the max_gap algo

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry #34749 not #34628

invalid = isna(yvalues)
valid = ~invalid

if not valid.any():
# have to call np.asarray(xvalues) since xvalues could be an Index
# which can't be mutated
result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
result.fill(np.nan)
return result

if valid.all():
return yvalues

# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
all_nans = set(np.flatnonzero(invalid))
start_nans = set(range(find_valid_index(yvalues, "first")))
end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
mid_nans = all_nans - start_nans - end_nans

# Like the sets above, preserve_nans contains indices of invalid values,
# but in this case, it is the final set of indices that need to be
# preserved as NaN after the interpolation.

# For example if limit_direction='forward' then preserve_nans will
# contain indices of NaNs at the beginning of the series, and NaNs that
# are more than'limit' away from the prior non-NaN.

# set preserve_nans based on direction using _interp_limit
preserve_nans: Union[List, Set]
if limit_direction == "forward":
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == "backward":
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))

# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
if limit_area == "inside":
# preserve NaNs on the outside
preserve_nans |= start_nans | end_nans
elif limit_area == "outside":
# preserve NaNs on the inside
preserve_nans |= mid_nans

# sort preserve_nans and covert to list
preserve_nans = sorted(preserve_nans)

yvalues = getattr(yvalues, "values", yvalues)
result = yvalues.copy()

if method in NP_METHODS:
# np.interp requires sorted X values, #21037
indexer = np.argsort(inds[valid])
result[invalid] = np.interp(
inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
)
else:
result[invalid] = _interpolate_scipy_wrapper(
inds[valid],
yvalues[valid],
inds[invalid],
method=method,
fill_value=fill_value,
bounds_error=bounds_error,
order=order,
**kwargs,
)

result[preserve_nans] = np.nan
return result

result[preserve_nans] = np.nan
return result
return func


def _interpolate_scipy_wrapper(
Expand Down