From f10d969d0113530293ca9ec02e4e7a63cf644e97 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 6 Jun 2020 18:19:06 +0100 Subject: [PATCH 1/4] WIP: avoid internals for Series.interpolate --- pandas/core/internals/blocks.py | 7 +-- pandas/core/missing.py | 7 ++- pandas/core/series.py | 90 +++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e70c8f9d5f09a..90f7956686a02 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1110,7 +1110,7 @@ def check_int_bool(self, inplace): downcast=downcast, ) # validate the interp method - m = missing.clean_interp_method(method, **kwargs) + m = missing.clean_interp_method(method, index, **kwargs) r = check_int_bool(self, inplace) if r is not None: @@ -1193,11 +1193,6 @@ def _interpolate( if fill_value is None: fill_value = self.fill_value - if method in ("krogh", "piecewise_polynomial", "pchip"): - if not index.is_monotonic: - raise ValueError( - f"{method} interpolation requires that the index be monotonic." - ) # process 1-d slices in the axis direction def func(x): diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 79bbef5fa5505..8ee55a6dd9641 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -92,7 +92,7 @@ def clean_fill_method(method, allow_nearest=False): return method -def clean_interp_method(method, **kwargs): +def clean_interp_method(method, index, **kwargs): order = kwargs.get("order") valid = [ "linear", @@ -118,6 +118,11 @@ def clean_interp_method(method, **kwargs): raise ValueError("You must specify the order of the spline or polynomial.") if method not in valid: raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") + if method in ("krogh", "piecewise_polynomial", "pchip"): + if not index.is_monotonic: + raise ValueError( + f"{method} interpolation requires that the index be monotonic." + ) return method diff --git a/pandas/core/series.py b/pandas/core/series.py index 6b5ed86027806..0175c6ead62e5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -40,19 +40,23 @@ from pandas.core.dtypes.cast import ( convert_dtypes, maybe_cast_to_extension_array, + maybe_downcast_to_dtype, validate_numeric_casting, ) from pandas.core.dtypes.common import ( ensure_platform_int, is_bool, is_categorical_dtype, + is_datetime64_any_dtype, is_dict_like, is_extension_array_dtype, is_integer, is_iterator, is_list_like, + is_numeric_dtype, is_object_dtype, is_scalar, + is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -92,6 +96,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager +import pandas.core.missing as missing from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -2222,6 +2227,91 @@ def quantile(self, q=0.5, interpolation="linear"): # scalar return result.iloc[0] + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs, + ): + """ + Interpolate values according to different methods. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + axis = self._get_axis_number(axis) + + if isinstance(self.index, MultiIndex) and method != "linear": + raise ValueError( + "Only `method=linear` interpolation is supported on MultiIndexes." + ) + + # for the methods backfill, bfill, pad, ffill limit_direction and limit_area + # are being ignored, see gh-26796 for more information + if method in ["backfill", "bfill", "pad", "ffill"]: + return self.fillna( + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + + # TODO: get x values from index could be helper function or shared with + # DataFrame () + index = self.index + if method == "linear": + # prior default + index = np.arange(len(index)) + else: + methods = {"index", "values", "nearest", "time"} + is_numeric_or_datetime = ( + is_numeric_dtype(index.dtype) + or is_datetime64_any_dtype(index.dtype) + or is_timedelta64_dtype(index.dtype) + ) + if method not in methods and not is_numeric_or_datetime: + raise ValueError( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + + if isna(index).any(): + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." + ) + + method = missing.clean_interp_method(method, index, **kwargs) + + arr = missing.interpolate_1d( + xvalues=index, + yvalues=self.values, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + bounds_error=False, + **kwargs, + ) + + if downcast is not None: + arr = maybe_downcast_to_dtype(arr, dtype=downcast) + + result = self._constructor(arr, index=self.index) + + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="interpolate") + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute correlation with `other` Series, excluding missing values. From c42e965913ccf468cba58cd7dfa927262aa33f5b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Jun 2020 10:12:06 +0100 Subject: [PATCH 2/4] deduplicate and typing --- pandas/core/generic.py | 52 +++++--------------- pandas/core/internals/blocks.py | 64 ++++++++++-------------- pandas/core/missing.py | 87 ++++++++++++++++++++++++--------- pandas/core/series.py | 64 ++++++------------------ 4 files changed, 118 insertions(+), 149 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4b074924baaf2..2be96b679070e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6863,16 +6863,16 @@ def replace( @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction="forward", - limit_area=None, - downcast=None, + self: FrameOrSeries, + method: str = "linear", + axis: Axis = 0, + limit: Optional[int] = None, + inplace: bool_t = False, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + downcast: Optional[str] = None, **kwargs, - ): + ) -> Optional[FrameOrSeries]: """ Interpolate values according to different methods. """ @@ -6881,11 +6881,6 @@ def interpolate( axis = self._get_axis_number(axis) index = self._get_axis(axis) - if isinstance(self.index, MultiIndex) and method != "linear": - raise ValueError( - "Only `method=linear` interpolation is supported on MultiIndexes." - ) - # for the methods backfill, bfill, pad, ffill limit_direction and limit_area # are being ignored, see gh-26796 for more information if method in ["backfill", "bfill", "pad", "ffill"]: @@ -6904,39 +6899,14 @@ def interpolate( else: df = self.T - if self.ndim == 2 and np.all(self.dtypes == np.dtype(object)): + if np.all(self.dtypes == np.dtype(object)): raise TypeError( "Cannot interpolate with all object-dtype columns " "in the DataFrame. Try setting at least one " "column to a numeric dtype." ) - if method == "linear": - # prior default - index = np.arange(len(df.index)) - else: - methods = {"index", "values", "nearest", "time"} - is_numeric_or_datetime = ( - is_numeric_dtype(index.dtype) - or is_datetime64_any_dtype(index.dtype) - or is_timedelta64_dtype(index.dtype) - ) - if method not in methods and not is_numeric_or_datetime: - raise ValueError( - "Index column must be numeric or datetime type when " - f"using {method} method other than linear. " - "Try setting a numeric or datetime index column before " - "interpolating." - ) - - if isna(index).any(): - raise NotImplementedError( - "Interpolation with NaNs in the index " - "has not been implemented. Try filling " - "those NaNs before interpolating." - ) - data = df._mgr - new_data = data.interpolate( + new_data = df._mgr.interpolate( method=method, axis=self._info_axis_number, index=index, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 90f7956686a02..5ffbb001f5a47 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta import inspect import re -from typing import Any, List +from typing import Any, List, Optional import warnings import numpy as np @@ -1081,14 +1081,9 @@ def interpolate( inplace = validate_bool_kwarg(inplace, "inplace") - def check_int_bool(self, inplace): - # Only FloatBlocks will contain NaNs. - # timedelta subclasses IntBlock - if (self.is_bool or self.is_integer) and not self.is_timedelta: - if inplace: - return self - else: - return self.copy() + # Only FloatBlocks will contain NaNs. timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + return self if inplace else self.copy() # a fill na type method try: @@ -1097,9 +1092,6 @@ def check_int_bool(self, inplace): m = None if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r return self._interpolate_with_fill( method=m, axis=axis, @@ -1109,15 +1101,13 @@ def check_int_bool(self, inplace): coerce=coerce, downcast=downcast, ) - # validate the interp method - m = missing.clean_interp_method(method, index, **kwargs) - r = check_int_bool(self, inplace) - if r is not None: - return r + # validate the interp method and get xvalues + m, xvalues = missing.clean_interp_method(method, index, **kwargs) + return self._interpolate( method=m, - index=index, + xvalues=xvalues, axis=axis, limit=limit, limit_direction=limit_direction, @@ -1130,13 +1120,13 @@ def check_int_bool(self, inplace): def _interpolate_with_fill( self, - method="pad", - axis=0, - inplace=False, - limit=None, - fill_value=None, - coerce=False, - downcast=None, + method: str = "pad", + axis: int = 0, + inplace: bool = False, + limit: Optional[str] = None, + fill_value: Optional[Any] = None, + coerce: bool = False, + downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -1169,15 +1159,15 @@ def _interpolate_with_fill( def _interpolate( self, - method=None, - index=None, - fill_value=None, - axis=0, - limit=None, - limit_direction="forward", - limit_area=None, - inplace=False, - downcast=None, + method: Optional[str] = None, + xvalues: Optional[np.ndarray] = None, + fill_value: Optional[Any] = None, + axis: int = 0, + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + inplace: bool = False, + downcast: Optional[str] = None, **kwargs, ) -> List["Block"]: """ interpolate using scipy wrappers """ @@ -1195,14 +1185,14 @@ def _interpolate( # process 1-d slices in the axis direction - def func(x): + def func(yvalues: np.ndarray) -> np.ndarray: # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to missing.interpolate_1d return missing.interpolate_1d( - index, - x, + xvalues=xvalues, + yvalues=yvalues, method=method, limit=limit, limit_direction=limit_direction, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8ee55a6dd9641..baa4b32de3e41 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,6 +2,8 @@ Routines for filling missing data. """ +from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union + import numpy as np from pandas._libs import algos, lib @@ -10,16 +12,22 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_float64, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_integer_dtype, + is_numeric_dtype, is_numeric_v_string_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from pandas import Index + def mask_missing(arr, values_to_mask): """ @@ -92,8 +100,16 @@ def clean_fill_method(method, allow_nearest=False): return method -def clean_interp_method(method, index, **kwargs): - order = kwargs.get("order") +def clean_interp_method( + method: str, index: "Index", order: Optional[int] = None, **kwargs +) -> Tuple[str, np.ndarray]: + """ + Validate Index and order keyword for interpolation methods. + + Returns + ------- + tuple of str, np.ndarray + """ valid = [ "linear", "time", @@ -123,8 +139,44 @@ def clean_interp_method(method, index, **kwargs): raise ValueError( f"{method} interpolation requires that the index be monotonic." ) + elif method == "time": + if not getattr(index, "is_all_dates", None): + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a DatetimeIndex" + ) + method = "values" - return method + if method == "linear": + xvalues = np.arange(len(index)) + else: + if isinstance(index, ABCMultiIndex): + raise ValueError( + "Only `method=linear` interpolation is supported on MultiIndexes." + ) + + methods = {"index", "values", "nearest", "time"} + is_numeric_or_datetime = ( + is_numeric_dtype(index.dtype) + or is_datetime64_any_dtype(index.dtype) + or is_timedelta64_dtype(index.dtype) + ) + if method not in methods and not is_numeric_or_datetime: + raise ValueError( + "Index column must be numeric or datetime type when " + f"using {method} method other than linear. " + "Try setting a numeric or datetime index column before " + "interpolating." + ) + if isna(index).any(): + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." + ) + xvalues = index.values + + return method, xvalues def find_valid_index(values, how: str): @@ -165,15 +217,15 @@ def find_valid_index(values, how: str): def interpolate_1d( - xvalues, - yvalues, - method="linear", - limit=None, - limit_direction="forward", - limit_area=None, - fill_value=None, - bounds_error=False, - order=None, + xvalues: np.ndarray, + yvalues: np.ndarray, + method: Optional[str] = "linear", + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + fill_value: Optional[Any] = None, + bounds_error: bool = False, + order: Optional[int] = None, **kwargs, ): """ @@ -198,16 +250,6 @@ def interpolate_1d( if valid.all(): return yvalues - if method == "time": - if not getattr(xvalues, "is_all_dates", None): - # if not issubclass(xvalues.dtype.type, np.datetime64): - raise ValueError( - "time-weighted interpolation only works " - "on Series or DataFrames with a " - "DatetimeIndex" - ) - method = "values" - valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: @@ -243,6 +285,7 @@ def interpolate_1d( # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit + preserve_nans: Union[List, Set] if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == "backward": diff --git a/pandas/core/series.py b/pandas/core/series.py index 0175c6ead62e5..6ead13911cc85 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -47,16 +47,13 @@ ensure_platform_int, is_bool, is_categorical_dtype, - is_datetime64_any_dtype, is_dict_like, is_extension_array_dtype, is_integer, is_iterator, is_list_like, - is_numeric_dtype, is_object_dtype, is_scalar, - is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -2229,15 +2226,15 @@ def quantile(self, q=0.5, interpolation="linear"): def interpolate( self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction="forward", - limit_area=None, - downcast=None, + method: str = "linear", + axis: Axis = 0, + limit: Optional[int] = None, + inplace: bool = False, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + downcast: Optional[str] = None, **kwargs, - ): + ) -> Optional["Series"]: """ Interpolate values according to different methods. """ @@ -2245,11 +2242,6 @@ def interpolate( axis = self._get_axis_number(axis) - if isinstance(self.index, MultiIndex) and method != "linear": - raise ValueError( - "Only `method=linear` interpolation is supported on MultiIndexes." - ) - # for the methods backfill, bfill, pad, ffill limit_direction and limit_area # are being ignored, see gh-26796 for more information if method in ["backfill", "bfill", "pad", "ffill"]: @@ -2261,39 +2253,12 @@ def interpolate( downcast=downcast, ) - # TODO: get x values from index could be helper function or shared with - # DataFrame () - index = self.index - if method == "linear": - # prior default - index = np.arange(len(index)) - else: - methods = {"index", "values", "nearest", "time"} - is_numeric_or_datetime = ( - is_numeric_dtype(index.dtype) - or is_datetime64_any_dtype(index.dtype) - or is_timedelta64_dtype(index.dtype) - ) - if method not in methods and not is_numeric_or_datetime: - raise ValueError( - "Index column must be numeric or datetime type when " - f"using {method} method other than linear. " - "Try setting a numeric or datetime index column before " - "interpolating." - ) - - if isna(index).any(): - raise NotImplementedError( - "Interpolation with NaNs in the index " - "has not been implemented. Try filling " - "those NaNs before interpolating." - ) - - method = missing.clean_interp_method(method, index, **kwargs) + # validate the interp method and get xvalues + method, xvalues = missing.clean_interp_method(method, self.index, **kwargs) arr = missing.interpolate_1d( - xvalues=index, - yvalues=self.values, + xvalues, + self.values, method=method, limit=limit, limit_direction=limit_direction, @@ -2305,10 +2270,11 @@ def interpolate( if downcast is not None: arr = maybe_downcast_to_dtype(arr, dtype=downcast) - result = self._constructor(arr, index=self.index) + result = self._constructor(arr, index=self.index, fastpath=True) if inplace: - return self._update_inplace(result) + self._update_inplace(result) + return None else: return result.__finalize__(self, method="interpolate") From 9df9e7e147280b6491b1ad924001f89dbb600fe3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Jun 2020 10:59:25 +0100 Subject: [PATCH 3/4] docstring --- pandas/core/series.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6ead13911cc85..3906e1817efe1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2224,6 +2224,7 @@ def quantile(self, q=0.5, interpolation="linear"): # scalar return result.iloc[0] + @doc(NDFrame.interpolate) def interpolate( self, method: str = "linear", @@ -2235,9 +2236,6 @@ def interpolate( downcast: Optional[str] = None, **kwargs, ) -> Optional["Series"]: - """ - Interpolate values according to different methods. - """ inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) From b88bc748e386fdb8a64f60d4f26fed2b122b6948 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Jun 2020 12:56:21 +0100 Subject: [PATCH 4/4] pass index to _interpolate --- pandas/core/internals/blocks.py | 42 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ffbb001f5a47..31b38941c0153 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta import inspect import re -from typing import Any, List, Optional +from typing import TYPE_CHECKING, Any, List, Optional import warnings import numpy as np @@ -83,6 +83,9 @@ import pandas.core.missing as missing from pandas.core.nanops import nanpercentile +if TYPE_CHECKING: + from pandas import Index + class Block(PandasObject): """ @@ -1066,16 +1069,16 @@ def coerce_to_target_dtype(self, other): def interpolate( self, - method="pad", - axis=0, - index=None, - inplace=False, - limit=None, - limit_direction="forward", - limit_area=None, - fill_value=None, - coerce=False, - downcast=None, + method: str = "pad", + axis: int = 0, + index: Optional["Index"] = None, + inplace: bool = False, + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + fill_value: Optional[Any] = None, + coerce: bool = False, + downcast: Optional[str] = None, **kwargs, ): @@ -1102,12 +1105,10 @@ def interpolate( downcast=downcast, ) - # validate the interp method and get xvalues - m, xvalues = missing.clean_interp_method(method, index, **kwargs) - + assert index is not None # for mypy return self._interpolate( - method=m, - xvalues=xvalues, + method=method, + index=index, axis=axis, limit=limit, limit_direction=limit_direction, @@ -1123,7 +1124,7 @@ def _interpolate_with_fill( method: str = "pad", axis: int = 0, inplace: bool = False, - limit: Optional[str] = None, + limit: Optional[int] = None, fill_value: Optional[Any] = None, coerce: bool = False, downcast: Optional[str] = None, @@ -1159,8 +1160,8 @@ def _interpolate_with_fill( def _interpolate( self, - method: Optional[str] = None, - xvalues: Optional[np.ndarray] = None, + method: str, + index: "Index", fill_value: Optional[Any] = None, axis: int = 0, limit: Optional[int] = None, @@ -1174,6 +1175,9 @@ def _interpolate( inplace = validate_bool_kwarg(inplace, "inplace") data = self.values if inplace else self.values.copy() + # validate the interp method and get xvalues + method, xvalues = missing.clean_interp_method(method, index, **kwargs) + # only deal with floats if not self.is_float: if not self.is_integer: