From 8f02636fba7f11db81f536177d16d2c86bfd6db1 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Jun 2023 16:49:23 -0700 Subject: [PATCH 1/5] ENH: EA.interpolate --- doc/source/reference/extensions.rst | 1 + pandas/_typing.py | 20 +++++ pandas/core/arrays/base.py | 26 +++++++ pandas/core/arrays/datetimelike.py | 3 +- pandas/core/arrays/numpy_.py | 3 +- pandas/core/generic.py | 23 +----- pandas/core/internals/blocks.py | 111 +++++++++++----------------- pandas/core/resample.py | 4 +- 8 files changed, 99 insertions(+), 92 deletions(-) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index b33efd388bd60..63eacc3f6d1d9 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -51,6 +51,7 @@ objects. api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna api.extensions.ExtensionArray.insert + api.extensions.ExtensionArray.interpolate api.extensions.ExtensionArray.isin api.extensions.ExtensionArray.isna api.extensions.ExtensionArray.ravel diff --git a/pandas/_typing.py b/pandas/_typing.py index 9d4acbe76ba15..ffe9e6b319dfd 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -307,6 +307,26 @@ def closed(self) -> bool: # Arguments for fillna() FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"] +InterpolateOptions = Literal[ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "polynomial", + "krogh", + "piecewise_polynomial", + "spline", + "pchip", + "akima", + "cubicspline", + "from_derivatives", +] # internals Manager = Union[ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 27eb7994d3ccb..ff314f8fd588d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -78,6 +78,7 @@ AxisInt, Dtype, FillnaOptions, + InterpolateOptions, NumpySorter, NumpyValueArrayLike, PositionalIndexer, @@ -90,6 +91,8 @@ npt, ) + from pandas import Index + _extension_array_shared_docs: dict[str, str] = {} @@ -118,6 +121,7 @@ class ExtensionArray: fillna equals insert + interpolate isin isna ravel @@ -155,6 +159,7 @@ class ExtensionArray: * take * copy * _concat_same_type + * interpolate A default repr displaying the type, (truncated) data, length, and dtype is provided. It can be customized or replaced by @@ -753,6 +758,27 @@ def argmax(self, skipna: bool = True) -> int: raise NotImplementedError return nargminmax(self, "argmax") + def interpolate( + self, + *, + method: InterpolateOptions, + axis: int, + index: Index | None, + limit, + limit_direction, + limit_area, + fill_value, + inplace: bool, + **kwargs, + ) -> Self: + """ + See NDFrame.interpolate.__doc__. + """ + # NB: we return type(self) even if inplace=True + raise NotImplementedError( + f"{type(self).__name__} does not implement interpolate" + ) + def fillna( self, value: object | ArrayLike | None = None, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ea085b3d1f6ab..f0fe0514fb8b3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -58,6 +58,7 @@ Dtype, DtypeObj, F, + InterpolateOptions, NpDtype, PositionalIndexer2D, PositionalIndexerTuple, @@ -2233,7 +2234,7 @@ def copy(self, order: str = "C") -> Self: def interpolate( self, *, - method, + method: InterpolateOptions, axis: int, index: Index | None, limit, diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 113f22ad968bc..b305655cc2c21 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -32,6 +32,7 @@ from pandas._typing import ( AxisInt, Dtype, + InterpolateOptions, NpDtype, Scalar, Self, @@ -227,7 +228,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, float | None]: def interpolate( self, *, - method, + method: InterpolateOptions, axis: int, index: Index | None, limit, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 711e552f262ac..2375f16cf8729 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -61,6 +61,7 @@ IgnoreRaise, IndexKeyFunc, IndexLabel, + InterpolateOptions, IntervalClosedType, JSONSerializable, Level, @@ -7658,27 +7659,7 @@ def replace( @final def interpolate( self, - method: Literal[ - "linear", - "time", - "index", - "values", - "pad", - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "barycentric", - "polynomial", - "krogh", - "piecewise_polynomial", - "spline", - "pchip", - "akima", - "cubicspline", - "from_derivatives", - ] = "linear", + method: InterpolateOptions = "linear", *, axis: Axis = 0, limit: int | None = None, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 981e29df2c323..81ef5132badef 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -35,6 +35,7 @@ F, FillnaOptions, IgnoreRaise, + InterpolateOptions, QuantileInterpolation, Self, Shape, @@ -1345,7 +1346,7 @@ def fillna( def interpolate( self, *, - method: FillnaOptions = "pad", + method: FillnaOptions | InterpolateOptions = "pad", axis: AxisInt = 0, index: Index | None = None, inplace: bool = False, @@ -1365,17 +1366,8 @@ def interpolate( return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - try: - m = missing.clean_fill_method(method) - except ValueError: - m = None - # error: Non-overlapping equality check (left operand type: - # "Literal['backfill', 'bfill', 'ffill', 'pad']", right - # operand type: "Literal['asfreq']") - if method == "asfreq": # type: ignore[comparison-overlap] - # clean_fill_method used to allow this - raise - if m is None and self.dtype == _dtype_obj: + # TODO(3.0): this case will not be reachable once GH#53638 is enforced + if not _interp_method_is_pad_or_backfill(method) and self.dtype == _dtype_obj: # only deal with floats # bc we already checked that can_hold_na, we don't have int dtype here # test_interp_basic checks that we make a copy here @@ -1407,10 +1399,11 @@ def interpolate( else: refs = self.refs - # Dispatch to the PandasArray method. - # We know self.array_values is a PandasArray bc EABlock overrides - new_values = cast(PandasArray, self.array_values).interpolate( - method=method, + # Dispatch to the EA method. + new_values = self.array_values.interpolate( + # error: Argument "method" to "interpolate" of "ExtensionArray" has + # incompatible type [...] + method=method, # type: ignore[arg-type] axis=axis, index=index, limit=limit, @@ -1420,7 +1413,7 @@ def interpolate( inplace=arr_inplace, **kwargs, ) - data = new_values._ndarray + data = extract_array(new_values, extract_numpy=True) nb = self.make_block_same_class(data, refs=refs) return nb._maybe_downcast([nb], downcast, using_cow) @@ -1841,7 +1834,8 @@ def values_for_json(self) -> np.ndarray: def interpolate( self, *, - method: FillnaOptions = "pad", + method: FillnaOptions | InterpolateOptions = "pad", + index: Index | None = None, axis: int = 0, inplace: bool = False, limit: int | None = None, @@ -1850,11 +1844,28 @@ def interpolate( **kwargs, ): values = self.values - if values.ndim == 2 and axis == 0: - # NDArrayBackedExtensionArray.fillna assumes axis=1 - new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T + + if not _interp_method_is_pad_or_backfill(method): + method = cast(InterpolateOptions, method) + return super().interpolate( + method=method, + index=index, + axis=axis, + inplace=inplace, + limit=limit, + fill_value=fill_value, + using_cow=using_cow, + **kwargs, + ) else: - new_values = values.fillna(value=fill_value, method=method, limit=limit) + method = cast(FillnaOptions, method) + if values.ndim == 2 and axis == 0: + # NDArrayBackedExtensionArray.fillna assumes axis=1 + new_values = values.T.fillna( + value=fill_value, method=method, limit=limit + ).T + else: + new_values = values.fillna(value=fill_value, method=method, limit=limit) return self.make_block_same_class(new_values) @@ -2248,51 +2259,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): def values_for_json(self) -> np.ndarray: return self.values._ndarray - def interpolate( - self, - *, - method: FillnaOptions = "pad", - index: Index | None = None, - axis: int = 0, - inplace: bool = False, - limit: int | None = None, - fill_value=None, - using_cow: bool = False, - **kwargs, - ): - values = self.values - - # error: Non-overlapping equality check (left operand type: - # "Literal['backfill', 'bfill', 'ffill', 'pad']", right operand type: - # "Literal['linear']") [comparison-overlap] - if method == "linear": # type: ignore[comparison-overlap] - # TODO: GH#50950 implement for arbitrary EAs - refs = None - arr_inplace = inplace - if using_cow: - if inplace and not self.refs.has_reference(): - refs = self.refs - else: - arr_inplace = False - - new_values = self.values.interpolate( - method=method, - index=index, - axis=axis, - inplace=arr_inplace, - limit=limit, - fill_value=fill_value, - **kwargs, - ) - return self.make_block_same_class(new_values, refs=refs) - - elif values.ndim == 2 and axis == 0: - # NDArrayBackedExtensionArray.fillna assumes axis=1 - new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T - else: - new_values = values.fillna(value=fill_value, method=method, limit=limit) - return self.make_block_same_class(new_values) - class DatetimeTZBlock(DatetimeLikeBlock): """implement a datetime64 block with a tz attribute""" @@ -2606,3 +2572,14 @@ def external_values(values: ArrayLike) -> ArrayLike: # TODO(CoW) we should also mark our ExtensionArrays as read-only return values + + +def _interp_method_is_pad_or_backfill(method: str) -> bool: + try: + m = missing.clean_fill_method(method) + except ValueError: + m = None + if method == "asfreq": + # clean_fill_method used to allow this + raise + return m is not None diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a9faad4cbef6a..83a1d94e68647 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -86,7 +86,7 @@ AxisInt, Frequency, IndexLabel, - QuantileInterpolation, + InterpolateOptions, T, TimedeltaConvertibleTypes, TimeGrouperOrigin, @@ -834,7 +834,7 @@ def fillna(self, method, limit: int | None = None): def interpolate( self, - method: QuantileInterpolation = "linear", + method: InterpolateOptions = "linear", *, axis: Axis = 0, limit: int | None = None, From af854bc105077c6a039ad4422bd31e93ea6ae808 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Jun 2023 08:29:20 -0700 Subject: [PATCH 2/5] update code check --- ci/code_checks.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index adda422296396..b7005741c5b45 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -295,6 +295,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.api.extensions.ExtensionArray.factorize \ pandas.api.extensions.ExtensionArray.fillna \ pandas.api.extensions.ExtensionArray.insert \ + pandas.api.extensions.ExtensionArray.interpolate \ pandas.api.extensions.ExtensionArray.isin \ pandas.api.extensions.ExtensionArray.isna \ pandas.api.extensions.ExtensionArray.ravel \ From 61eaa4e6ab86a270860262722d742fa6e81303b4 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 14 Jun 2023 14:25:49 -0700 Subject: [PATCH 3/5] update docstring to appease code check --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index ff314f8fd588d..f2856e8855bd2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -772,7 +772,7 @@ def interpolate( **kwargs, ) -> Self: """ - See NDFrame.interpolate.__doc__. + See DataFrame.interpolate.__doc__. """ # NB: we return type(self) even if inplace=True raise NotImplementedError( From ffd33fc5140b024b7b094232a6fbfb58ef89e536 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Jun 2023 15:44:46 -0700 Subject: [PATCH 4/5] inplace->copy --- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/datetimelike.py | 8 ++++---- pandas/core/arrays/numpy_.py | 8 ++++---- pandas/core/internals/blocks.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a645df92de81a..0856e321ad190 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -768,13 +768,13 @@ def interpolate( limit_direction, limit_area, fill_value, - inplace: bool, + copy: bool, **kwargs, ) -> Self: """ See DataFrame.interpolate.__doc__. """ - # NB: we return type(self) even if inplace=True + # NB: we return type(self) even if copy=False raise NotImplementedError( f"{type(self).__name__} does not implement interpolate" ) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f0fe0514fb8b3..4fbf8eabb3991 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2241,17 +2241,17 @@ def interpolate( limit_direction, limit_area, fill_value, - inplace: bool, + copy: bool, **kwargs, ) -> Self: """ See NDFrame.interpolate.__doc__. """ - # NB: we return type(self) even if inplace=True + # NB: we return type(self) even if copy=False if method != "linear": raise NotImplementedError - if inplace: + if not copy: out_data = self._ndarray else: out_data = self._ndarray.copy() @@ -2267,7 +2267,7 @@ def interpolate( fill_value=fill_value, **kwargs, ) - if inplace: + if not copy: return self return type(self)._simple_new(out_data, dtype=self.dtype) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index b305655cc2c21..4d3559c195b36 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -235,14 +235,14 @@ def interpolate( limit_direction, limit_area, fill_value, - inplace: bool, + copy: bool, **kwargs, ) -> Self: """ See NDFrame.interpolate.__doc__. """ - # NB: we return type(self) even if inplace=True - if inplace: + # NB: we return type(self) even if copy=False + if not copy: out_data = self._ndarray else: out_data = self._ndarray.copy() @@ -258,7 +258,7 @@ def interpolate( fill_value=fill_value, **kwargs, ) - if inplace: + if not copy: return self return type(self)._simple_new(out_data, dtype=self.dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 162c763c4245d..bac0af5c5a9be 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1393,10 +1393,10 @@ def interpolate( ) refs = None - arr_inplace = inplace + copy = not inplace if inplace: if using_cow and self.refs.has_reference(): - arr_inplace = False + copy = True else: refs = self.refs @@ -1411,7 +1411,7 @@ def interpolate( limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, - inplace=arr_inplace, + copy=copy, **kwargs, ) data = extract_array(new_values, extract_numpy=True) From f445ae983f8f5154cf63b35b2115a878df083197 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 28 Jun 2023 09:22:17 -0700 Subject: [PATCH 5/5] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 517baa648d805..50e8bb0087d73 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -111,6 +111,7 @@ Other enhancements - :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) - :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) - :meth:`SeriesGroupby.transform` and :meth:`DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) +- Added :meth:`ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`). - Groupby aggregations (such as :meth:`DataFrameGroupby.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) @@ -118,6 +119,7 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) +- .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: