From 659895f290afe8ceafcecceba3c9bfbbe741a0fb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 11 Jun 2020 13:47:28 +0100 Subject: [PATCH 1/2] interpolate_1d returns function --- pandas/core/internals/blocks.py | 26 ++--- pandas/core/missing.py | 199 ++++++++++++++------------------ 2 files changed, 98 insertions(+), 127 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e2a778f729470..c390d48ee23d9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1194,22 +1194,16 @@ def _interpolate( ) # process 1-d slices in the axis direction - def func(yvalues: np.ndarray) -> np.ndarray: - - # process a 1-d slice, returning it - # should the axis argument be handled below in apply_along_axis? - # i.e. not an arg to missing.interpolate_1d - return missing.interpolate_1d( - xvalues=index, - yvalues=yvalues, - method=method, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, - bounds_error=False, - **kwargs, - ) + func = missing.interpolate_1d( + xvalues=index, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + bounds_error=False, + **kwargs, + ) # interp each column independently interp_values = np.apply_along_axis(func, axis, data) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d8671616f944e..b0b256ca0a3ff 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,7 +2,7 @@ Routines for filling missing data. """ -from typing import Any, List, Optional, Set, Union +from typing import Any, Callable, List, Optional, Set, Union import numpy as np @@ -96,26 +96,23 @@ def clean_fill_method(method, allow_nearest=False): def clean_interp_method(method: str, **kwargs) -> str: order = kwargs.get("order") - valid = [ - "linear", - "time", - "index", - "values", + sp_methods = [ "nearest", "zero", "slinear", "quadratic", "cubic", "barycentric", - "polynomial", "krogh", + "spline", + "polynomial", + "from_derivatives", "piecewise_polynomial", "pchip", "akima", - "spline", - "from_derivatives", "cubicspline", ] + valid = ["linear", "time", "index", "values"] + sp_methods if method in ("spline", "polynomial") and order is None: raise ValueError("You must specify the order of the spline or polynomial.") if method not in valid: @@ -163,7 +160,6 @@ def find_valid_index(values, how: str): def interpolate_1d( xvalues: np.ndarray, - yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, limit_direction: str = "forward", @@ -172,7 +168,7 @@ def interpolate_1d( bounds_error: bool = False, order: Optional[int] = None, **kwargs, -): +) -> Callable[[np.ndarray], np.ndarray]: """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. @@ -182,19 +178,6 @@ def interpolate_1d( """ # Treat the original, non-scipy methods first. - invalid = isna(yvalues) - valid = ~invalid - - if not valid.any(): - # have to call np.asarray(xvalues) since xvalues could be an Index - # which can't be mutated - result = np.empty_like(np.asarray(xvalues), dtype=np.float64) - result.fill(np.nan) - return result - - if valid.all(): - return yvalues - if method == "time": if not getattr(xvalues, "is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): @@ -225,98 +208,92 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... - all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(find_valid_index(yvalues, "first"))) - end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) - mid_nans = all_nans - start_nans - end_nans - - # Like the sets above, preserve_nans contains indices of invalid values, - # but in this case, it is the final set of indices that need to be - # preserved as NaN after the interpolation. - - # For example if limit_direction='forward' then preserve_nans will - # contain indices of NaNs at the beginning of the series, and NaNs that - # are more than'limit' away from the prior non-NaN. - - # set preserve_nans based on direction using _interp_limit - preserve_nans: Union[List, Set] - if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) - else: - # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) - - # if limit_area is set, add either mid or outside indices - # to preserve_nans GH #16284 - if limit_area == "inside": - # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans - elif limit_area == "outside": - # preserve NaNs on the inside - preserve_nans |= mid_nans - - # sort preserve_nans and covert to list - preserve_nans = sorted(preserve_nans) - xvalues = getattr(xvalues, "values", xvalues) - yvalues = getattr(yvalues, "values", yvalues) - result = yvalues.copy() - - if method in ["linear", "time", "index", "values"]: - if method in ("values", "index"): - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if needs_i8_conversion(inds.dtype): - inds = inds.view(np.int64) - if inds.dtype == np.object_: - inds = lib.maybe_convert_objects(inds) + + inds = np.asarray(xvalues) + + # hack for DatetimeIndex, #1646 + if method != "linear" and needs_i8_conversion(inds.dtype): + inds = inds.view(np.int64) + + if method in ("values", "index"): + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + + def func(yvalues: np.ndarray) -> np.ndarray: + invalid = isna(yvalues) + valid = ~invalid + + if not valid.any(): + # have to call np.asarray(xvalues) since xvalues could be an Index + # which can't be mutated + result = np.empty_like(np.asarray(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if valid.all(): + return yvalues + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(find_valid_index(yvalues, "first"))) + end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. + + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + preserve_nans: Union[List, Set] + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: - inds = xvalues - # np.interp requires sorted X values, #21037 - indexer = np.argsort(inds[valid]) - result[invalid] = np.interp( - inds[invalid], inds[valid][indexer], yvalues[valid][indexer] - ) + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == "inside": + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == "outside": + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and covert to list + preserve_nans = sorted(preserve_nans) + + yvalues = getattr(yvalues, "values", yvalues) + result = yvalues.copy() + + if method in ["linear", "index", "values"]: + # np.interp requires sorted X values, #21037 + indexer = np.argsort(inds[valid]) + result[invalid] = np.interp( + inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + ) + else: + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs, + ) result[preserve_nans] = np.nan return result - sp_methods = [ - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "barycentric", - "krogh", - "spline", - "polynomial", - "from_derivatives", - "piecewise_polynomial", - "pchip", - "akima", - "cubicspline", - ] - - if method in sp_methods: - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if issubclass(inds.dtype.type, np.datetime64): - inds = inds.view(np.int64) - result[invalid] = _interpolate_scipy_wrapper( - inds[valid], - yvalues[valid], - inds[invalid], - method=method, - fill_value=fill_value, - bounds_error=bounds_error, - order=order, - **kwargs, - ) - result[preserve_nans] = np.nan - return result + return func def _interpolate_scipy_wrapper( From 19e2bdb6d33a8bba34b278e29b4ced7e807f8f2e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 13 Jun 2020 11:43:58 +0100 Subject: [PATCH 2/2] CLN: clean and deduplicate in core.missing.interpolate_1d --- pandas/core/internals/blocks.py | 26 ++-- pandas/core/missing.py | 217 +++++++++++++++++--------------- 2 files changed, 130 insertions(+), 113 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c390d48ee23d9..e2a778f729470 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1194,16 +1194,22 @@ def _interpolate( ) # process 1-d slices in the axis direction - func = missing.interpolate_1d( - xvalues=index, - method=method, - limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, - bounds_error=False, - **kwargs, - ) + def func(yvalues: np.ndarray) -> np.ndarray: + + # process a 1-d slice, returning it + # should the axis argument be handled below in apply_along_axis? + # i.e. not an arg to missing.interpolate_1d + return missing.interpolate_1d( + xvalues=index, + yvalues=yvalues, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + bounds_error=False, + **kwargs, + ) # interp each column independently interp_values = np.apply_along_axis(func, axis, data) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index b0b256ca0a3ff..7802c5cbdbfb3 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,7 +2,7 @@ Routines for filling missing data. """ -from typing import Any, Callable, List, Optional, Set, Union +from typing import Any, List, Optional, Set, Union import numpy as np @@ -94,27 +94,37 @@ def clean_fill_method(method, allow_nearest=False): return method +# interpolation methods that dispatch to np.interp + +NP_METHODS = ["linear", "time", "index", "values"] + +# interpolation methods that dispatch to _interpolate_scipy_wrapper + +SP_METHODS = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + "cubicspline", +] + + def clean_interp_method(method: str, **kwargs) -> str: order = kwargs.get("order") - sp_methods = [ - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "barycentric", - "krogh", - "spline", - "polynomial", - "from_derivatives", - "piecewise_polynomial", - "pchip", - "akima", - "cubicspline", - ] - valid = ["linear", "time", "index", "values"] + sp_methods + if method in ("spline", "polynomial") and order is None: raise ValueError("You must specify the order of the spline or polynomial.") + + valid = NP_METHODS + SP_METHODS if method not in valid: raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") @@ -160,6 +170,7 @@ def find_valid_index(values, how: str): def interpolate_1d( xvalues: np.ndarray, + yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, limit_direction: str = "forward", @@ -168,7 +179,7 @@ def interpolate_1d( bounds_error: bool = False, order: Optional[int] = None, **kwargs, -) -> Callable[[np.ndarray], np.ndarray]: +): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. @@ -176,7 +187,18 @@ def interpolate_1d( Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ - # Treat the original, non-scipy methods first. + invalid = isna(yvalues) + valid = ~invalid + + if not valid.any(): + # have to call np.asarray(xvalues) since xvalues could be an Index + # which can't be mutated + result = np.empty_like(np.asarray(xvalues), dtype=np.float64) + result.fill(np.nan) + return result + + if valid.all(): + return yvalues if method == "time": if not getattr(xvalues, "is_all_dates", None): @@ -208,92 +230,81 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(find_valid_index(yvalues, "first"))) + end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. + + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + preserve_nans: Union[List, Set] + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + else: + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == "inside": + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == "outside": + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and covert to list + preserve_nans = sorted(preserve_nans) + + yvalues = getattr(yvalues, "values", yvalues) + result = yvalues.copy() + + # xvalues to pass to NumPy/SciPy + xvalues = getattr(xvalues, "values", xvalues) + if method == "linear": + inds = xvalues + else: + inds = np.asarray(xvalues) - inds = np.asarray(xvalues) - - # hack for DatetimeIndex, #1646 - if method != "linear" and needs_i8_conversion(inds.dtype): - inds = inds.view(np.int64) - - if method in ("values", "index"): - if inds.dtype == np.object_: - inds = lib.maybe_convert_objects(inds) - - def func(yvalues: np.ndarray) -> np.ndarray: - invalid = isna(yvalues) - valid = ~invalid - - if not valid.any(): - # have to call np.asarray(xvalues) since xvalues could be an Index - # which can't be mutated - result = np.empty_like(np.asarray(xvalues), dtype=np.float64) - result.fill(np.nan) - return result - - if valid.all(): - return yvalues - - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... - all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(find_valid_index(yvalues, "first"))) - end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) - mid_nans = all_nans - start_nans - end_nans - - # Like the sets above, preserve_nans contains indices of invalid values, - # but in this case, it is the final set of indices that need to be - # preserved as NaN after the interpolation. - - # For example if limit_direction='forward' then preserve_nans will - # contain indices of NaNs at the beginning of the series, and NaNs that - # are more than'limit' away from the prior non-NaN. - - # set preserve_nans based on direction using _interp_limit - preserve_nans: Union[List, Set] - if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) - else: - # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) - - # if limit_area is set, add either mid or outside indices - # to preserve_nans GH #16284 - if limit_area == "inside": - # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans - elif limit_area == "outside": - # preserve NaNs on the inside - preserve_nans |= mid_nans - - # sort preserve_nans and covert to list - preserve_nans = sorted(preserve_nans) - - yvalues = getattr(yvalues, "values", yvalues) - result = yvalues.copy() - - if method in ["linear", "index", "values"]: - # np.interp requires sorted X values, #21037 - indexer = np.argsort(inds[valid]) - result[invalid] = np.interp( - inds[invalid], inds[valid][indexer], yvalues[valid][indexer] - ) - else: - result[invalid] = _interpolate_scipy_wrapper( - inds[valid], - yvalues[valid], - inds[invalid], - method=method, - fill_value=fill_value, - bounds_error=bounds_error, - order=order, - **kwargs, - ) - result[preserve_nans] = np.nan - return result + # hack for DatetimeIndex, #1646 + if needs_i8_conversion(inds.dtype): + inds = inds.view(np.int64) + + if method in ("values", "index"): + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + + if method in NP_METHODS: + # np.interp requires sorted X values, #21037 + indexer = np.argsort(inds[valid]) + result[invalid] = np.interp( + inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + ) + else: + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs, + ) - return func + result[preserve_nans] = np.nan + return result def _interpolate_scipy_wrapper(