From d6790a04145a1678841ffa929ca978815f1d5235 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Fri, 19 Nov 2021 12:37:37 -0500 Subject: [PATCH 1/9] Laid out intial plans and ideas for how to handle series and DataFrame shift functions to allow iterables to be passed as the period --- pandas/core/generic.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0945193673107..bf93796248621 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9368,7 +9368,19 @@ def shift( 2020-01-07 30 33 37 2020-01-08 45 48 52 """ - if periods == 0: + + # --- [chein] #44424 temp edit --- # + + # Check if int + if type(periods) != int: + # check if instance of iter + if not isinstance(periods, iter): + try: + periods_iter = iter(periods) + except TypeError as te: + raise te + # when no col shift, return self + elif periods == 0: return self.copy() if freq is None: From d60c2d9c450e57973d8541557e4ab3fdba97355d Mon Sep 17 00:00:00 2001 From: skwirskj Date: Sun, 21 Nov 2021 16:06:53 -0500 Subject: [PATCH 2/9] ENH: Added functionality to DataFrame and Series shift functions to take in a list for periods and then return a concatenated DataFrame of each consecutive shift in the periods list --- pandas/core/frame.py | 28 ++++++++++++++++++++++++++++ pandas/core/series.py | 10 ++++++++++ 2 files changed, 38 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0960ab4a81149..8ea0b2b507d44 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5448,6 +5448,34 @@ def shift( ) -> DataFrame: axis = self._get_axis_number(axis) + # Handle the case of multiple shifts + if is_list_like(periods): + + new_df = DataFrame() + + from pandas.core.reshape.concat import concat + + for i in periods: + if not isinstance(i, int): + raise TypeError( + f"Value {i} in periods is not an integer, expected an integer" + ) + + new_df = concat( + [ + new_df, + super() + .shift(periods=i, freq=freq, axis=axis, fill_value=fill_value) + .add_suffix(f"_{i}"), + ], + axis=1, + ) + + if new_df.empty: + return self + + return new_df + ncols = len(self.columns) if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: # We will infer fill_value to match the closest column diff --git a/pandas/core/series.py b/pandas/core/series.py index e0a63b8e35105..1d99c5e3a5835 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4927,6 +4927,16 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit): # error: Cannot determine type of 'shift' @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> Series: + + # Handle the case of multiple shifts + if is_list_like(periods): + if len(periods) == 0: + return self + + df = self.to_frame() + + return df.shift(periods, freq=freq, axis=axis, fill_value=fill_value) + return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) From 5cb3f9ed783a73a108b9eb3112ed2d8e28bb16fd Mon Sep 17 00:00:00 2001 From: skwirskj Date: Sun, 28 Nov 2021 15:14:11 -0500 Subject: [PATCH 3/9] TST: GH44424 Added tests for shift methods Created two new tests to test shifting data in Series and DataFrames with an iterable as the period parameter. --- pandas/tests/frame/methods/test_shift.py | 20 ++++++++++++++++++++ pandas/tests/series/methods/test_shift.py | 13 +++++++++++++ 2 files changed, 33 insertions(+) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 9cd0b8bb5b315..0cb43ffefe5a8 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -429,3 +429,23 @@ def test_shift_axis1_categorical_columns(self): columns=ci, ) tm.assert_frame_equal(result, expected) + + def test_shift_with_iterable(self): + # GH#44424 + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + shifts = [0, 1, 2] + + df = DataFrame(data) + shifted = df.shift(shifts) + + expected = DataFrame( + { + "a_0": [1, 2, 3], + "b_0": [4, 5, 6], + "a_1": [np.NaN, 1.0, 2.0], + "b_1": [np.NaN, 4.0, 5.0], + "a_2": [np.NaN, np.NaN, 1.0], + "b_2": [np.NaN, np.NaN, 4.0], + } + ) + tm.assert_frame_equal(expected, shifted) diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 4fb378720d89d..4ceb2a569aa9e 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -14,6 +14,7 @@ offsets, ) import pandas._testing as tm +from pandas.core.frame import DataFrame from pandas.tseries.offsets import BDay @@ -376,3 +377,15 @@ def test_shift_non_writable_array(self, input_data, output_data): expected = Series(output_data, dtype="float64") tm.assert_series_equal(result, expected) + + def test_shift_with_iterable(self): + # GH#44424 + ser = Series([1, 2, 3]) + shifts = [0, 1, 2] + + shifted = ser.shift(shifts) + expected = DataFrame( + {"0_0": [1, 2, 3], "0_1": [np.NaN, 1.0, 2.0], "0_2": [np.NaN, np.NaN, 1.0]} + ) + + tm.assert_frame_equal(expected, shifted) From 7f59f5f9f2a96fdf44b850bd4465623565495fa4 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Sun, 28 Nov 2021 15:54:07 -0500 Subject: [PATCH 4/9] ENH: GH#44424 Updated core/generic.py to latest master Reverted some changes that were made when originally finding a solution and then reset it to the latest version on the upstream master branch. --- pandas/core/generic.py | 286 +++++++++++------------------------------ 1 file changed, 72 insertions(+), 214 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a485049eee089..fd8af2c0cedd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,6 +12,7 @@ from typing import ( TYPE_CHECKING, Any, + AnyStr, Callable, Hashable, Literal, @@ -43,7 +44,7 @@ Dtype, DtypeArg, DtypeObj, - FilePath, + FilePathOrBuffer, IndexKeyFunc, IndexLabel, JSONSerializable, @@ -57,7 +58,6 @@ TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, - WriteBuffer, npt, ) from pandas.compat._optional import import_optional_dependency @@ -2332,7 +2332,7 @@ def to_excel( @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, - path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + path_or_buf: FilePathOrBuffer | None = None, orient: str | None = None, date_format: str | None = None, double_precision: int = 10, @@ -2353,10 +2353,9 @@ def to_json( Parameters ---------- - path_or_buf : str, path object, file-like object, or None, default None - String, path object (implementing os.PathLike[str]), or file-like - object implementing a write() function. If None, the result is - returned as a string. + path_or_buf : str or file handle, optional + File path or object. If not specified, the result is returned as + a string. orient : str Indication of expected JSON string format. @@ -3272,7 +3271,6 @@ def to_latex( {returns} See Also -------- - Styler.to_latex : Render a DataFrame to LaTeX with conditional formatting. DataFrame.to_string : Render a DataFrame to a console-friendly tabular output. DataFrame.to_html : Render a DataFrame as an HTML table. @@ -3282,7 +3280,7 @@ def to_latex( >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], ... mask=['red', 'purple'], ... weapon=['sai', 'bo staff'])) - >>> print(df.to_latex(index=False)) # doctest: +SKIP + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE \begin{{tabular}}{{lll}} \toprule name & mask & weapon \\ @@ -3292,15 +3290,6 @@ def to_latex( \bottomrule \end{{tabular}} """ - msg = ( - "In future versions `DataFrame.to_latex` is expected to utilise the base " - "implementation of `Styler.to_latex` for formatting and rendering. " - "The arguments signature may therefore change. It is recommended instead " - "to use `DataFrame.style.to_latex` which also contains additional " - "functionality." - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - # Get defaults from the pandas config if self.ndim == 1: self = self.to_frame() @@ -3348,7 +3337,7 @@ def to_latex( @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, - path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + path_or_buf: FilePathOrBuffer[AnyStr] | None = None, sep: str = ",", na_rep: str = "", float_format: str | None = None, @@ -3375,11 +3364,10 @@ def to_csv( Parameters ---------- - path_or_buf : str, path object, file-like object, or None, default None - String, path object (implementing os.PathLike[str]), or file-like - object implementing a write() function. If None, the result is - returned as a string. If a non-binary file object is passed, it should - be opened with `newline=''`, disabling universal newlines. If a binary + path_or_buf : str or file handle, default None + File path or object, if None is provided the result is returned as + a string. If a non-binary file object is passed, it should be opened + with `newline=''`, disabling universal newlines. If a binary file object is passed, `mode` might need to contain a `'b'`. .. versionchanged:: 1.2.0 @@ -3421,9 +3409,6 @@ def to_csv( and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. - If `path_or_buf` is omitted or `None` or is a file opened in text - mode, this argument is ignored and an (uncompressed) string is - returned/written. .. versionchanged:: 1.0.0 @@ -5311,9 +5296,10 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. - random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional - If int, array-like, or BitGenerator, seed for random number generator. - If np.random.RandomState or np.random.Generator, use as given. + random_state : int, array-like, BitGenerator, np.random.RandomState, + np.random.Generator, optional. If int, array-like, or BitGenerator, seed for + random number generator. If np.random.RandomState or np.random.Generator, + use as given. .. versionchanged:: 1.1.0 @@ -5402,7 +5388,7 @@ def sample( num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 - """ # noqa:E501 + """ if axis is None: axis = self._stat_axis_number @@ -5849,8 +5835,7 @@ def astype( if col_name not in self: raise KeyError( "Only a column name can be used for the " - "key in a dtype mappings argument. " - f"'{col_name}' not found in columns." + "key in a dtype mappings argument." ) # GH#44417 cast to Series so we can use .iat below, which will be @@ -7905,10 +7890,11 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : Timestamp or str, default 'start_day' + origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp + or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. - If string, must be one of the following: + If a timestamp is not used, these values are also supported: - 'epoch': `origin` is 1970-01-01 - 'start': `origin` is the first value of the timeseries @@ -9380,19 +9366,7 @@ def shift( 2020-01-07 30 33 37 2020-01-08 45 48 52 """ - - # --- [chein] #44424 temp edit --- # - - # Check if int - if type(periods) != int: - # check if instance of iter - if not isinstance(periods, iter): - try: - periods_iter = iter(periods) - except TypeError as te: - raise te - # when no col shift, return self - elif periods == 0: + if periods == 0: return self.copy() if freq is None: @@ -10310,14 +10284,7 @@ def pct_change( return rs @final - def _agg_by_level( - self, - name: str, - axis: Axis = 0, - level: Level = 0, - skipna: bool_t = True, - **kwargs, - ): + def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): if axis is None: raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) @@ -10330,17 +10297,9 @@ def _agg_by_level( @final def _logical_func( - self, - name: str, - func, - axis: Axis = 0, - bool_only: bool_t | None = None, - skipna: bool_t = True, - level: Level | None = None, - **kwargs, - ) -> Series | bool_t: + self, name: str, func, axis=0, bool_only=None, skipna=True, level=None, **kwargs + ): nv.validate_logical_func((), kwargs, fname=name) - validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: warnings.warn( "Using the level keyword in DataFrame and Series aggregations is " @@ -10371,40 +10330,18 @@ def _logical_func( filter_type="bool", ) - def any( - self, - axis: Axis = 0, - bool_only: bool_t | None = None, - skipna: bool_t = True, - level: Level | None = None, - **kwargs, - ) -> Series | bool_t: + def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return self._logical_func( "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs ) - def all( - self, - axis: Axis = 0, - bool_only: bool_t | None = None, - skipna: bool_t = True, - level: Level | None = None, - **kwargs, - ) -> Series | bool_t: + def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return self._logical_func( "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs ) @final - def _accum_func( - self, - name: str, - func, - axis: Axis | None = None, - skipna: bool_t = True, - *args, - **kwargs, - ): + def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) if axis is None: axis = self._stat_axis_number @@ -10428,20 +10365,20 @@ def block_accum_func(blk_values): return self._constructor(result).__finalize__(self, method=name) - def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cummax(self, axis=None, skipna=True, *args, **kwargs): return self._accum_func( "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs ) - def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cummin(self, axis=None, skipna=True, *args, **kwargs): return self._accum_func( "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs ) - def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cumsum(self, axis=None, skipna=True, *args, **kwargs): return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) - def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): + def cumprod(self, axis=None, skipna=True, *args, **kwargs): return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) @final @@ -10449,15 +10386,14 @@ def _stat_function_ddof( self, name: str, func, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - ddof: int = 1, - numeric_only: bool_t | None = None, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, **kwargs, - ) -> Series | float: + ): nv.validate_stat_ddof_func((), kwargs, fname=name) - validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is None: axis = self._stat_axis_number if level is not None: @@ -10476,40 +10412,22 @@ def _stat_function_ddof( ) def sem( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - ddof: int = 1, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs + ): return self._stat_function_ddof( "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs ) def var( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - ddof: int = 1, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs + ): return self._stat_function_ddof( "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs ) def std( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - ddof: int = 1, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs + ): return self._stat_function_ddof( "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs ) @@ -10519,19 +10437,16 @@ def _stat_function( self, name: str, func, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, + axis=None, + skipna=True, + level=None, + numeric_only=None, **kwargs, ): if name == "median": nv.validate_median((), kwargs) else: nv.validate_stat_func((), kwargs, fname=name) - - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None: axis = self._stat_axis_number if level is not None: @@ -10549,74 +10464,32 @@ def _stat_function( func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) - def min( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ): + def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs ) - def max( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ): + def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs ) - def mean( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs ) - def median( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + def median(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs ) - def skew( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs ) - def kurt( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - **kwargs, - ) -> Series | float: + def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): return self._stat_function( "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs ) @@ -10628,11 +10501,11 @@ def _min_count_stat_function( self, name: str, func, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - min_count: int = 0, + axis=None, + skipna=True, + level=None, + numeric_only=None, + min_count=0, **kwargs, ): if name == "sum": @@ -10641,9 +10514,6 @@ def _min_count_stat_function( nv.validate_prod((), kwargs) else: nv.validate_stat_func((), kwargs, fname=name) - - validate_bool_kwarg(skipna, "skipna", none_allowed=False) - if axis is None: axis = self._stat_axis_number if level is not None: @@ -10673,10 +10543,10 @@ def _min_count_stat_function( def sum( self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, + axis=None, + skipna=True, + level=None, + numeric_only=None, min_count=0, **kwargs, ): @@ -10686,11 +10556,11 @@ def sum( def prod( self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - numeric_only: bool_t | None = None, - min_count: int = 0, + axis=None, + skipna=True, + level=None, + numeric_only=None, + min_count=0, **kwargs, ): return self._min_count_stat_function( @@ -10706,12 +10576,7 @@ def prod( product = prod - def mad( - self, - axis: Axis | None = None, - skipna: bool_t = True, - level: Level | None = None, - ) -> Series | float: + def mad(self, axis=None, skipna=None, level=None): """ {desc} @@ -10719,7 +10584,7 @@ def mad( ---------- axis : {axis_descr} Axis for the function to be applied on. - skipna : bool, default True + skipna : bool, default None Exclude NA/null values when computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a @@ -10731,14 +10596,7 @@ def mad( {see_also}\ {examples} """ - if not is_bool(skipna): - warnings.warn( - "Passing None for skipna is deprecated and will raise in a future" - "version. Pass True instead. Only boolean values will be allowed " - "in the future.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if skipna is None: skipna = True if axis is None: axis = self._stat_axis_number @@ -10808,7 +10666,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): see_also="", examples="", ) - def mad(self, axis=None, skipna=True, level=None): + def mad(self, axis=None, skipna=None, level=None): return NDFrame.mad(self, axis, skipna, level) setattr(cls, "mad", mad) From 51830d0e25c78e368d9a1de1006ce0e7b83eacf1 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Sun, 28 Nov 2021 16:27:52 -0500 Subject: [PATCH 5/9] ENH: GH#44424 Updated whatsnew to explain changes --- doc/source/whatsnew/v1.4.0.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 50156d4565bbd..d1e06aca7c7ae 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -186,6 +186,34 @@ representation of :class:`DataFrame` objects (:issue:`4889`). df df.to_dict(orient='tight') +.. _whatsnew_140.enhancements.shift: + +DataFrame.shift and Series.shift now accept an iterable for parameter ``'period'`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`DataFrame.shift` and :meth:`Series.shift` functions can take in an iterable, such as a list, for the period parameter. When an iterable is passed +to either function it returns a :class:`DataFrame` object with all of the shifted rows or columns concatenated with one another. +The function applies a shift designated by each element in the iterable. The resulting :class:`DataFrame` object's columns will retain the +names from the :class:`DataFrame` object that called shift, but postfixed with _, where name is the original +column name and num correlates to the current element of the period iterable (:issue:`44424`). + +Usage within the :class:`DataFrame` class: +.. ipython:: python + + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': [4, 5, 6] + }) + shifts = [0, 1, 2] + df.shift(shifts) + +Usage within the :class:`Series` class: +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + shifts = [0, 1, 2] + + .. _whatsnew_140.enhancements.other: Other enhancements From 9c41dce442042faab7a3fd1e9c29b47897800bd6 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Sun, 28 Nov 2021 17:53:14 -0500 Subject: [PATCH 6/9] ENH: GH#44424 Added Series.shift call to whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d1e06aca7c7ae..62674749d9c48 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -213,6 +213,7 @@ Usage within the :class:`Series` class: ser = pd.Series([1, 2, 3]) shifts = [0, 1, 2] + ser.shift(shifts) .. _whatsnew_140.enhancements.other: From bc96dbc2b52b5e2c5536d8f821df7ad28ea0c276 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Mon, 29 Nov 2021 16:00:41 -0500 Subject: [PATCH 7/9] ENH: GH#44424 DataFrame.shift performance updates Updated how we are creating the new DataFrame after shifting to improve performance. Also reverted core/generic.py to master --- doc/source/whatsnew/v1.4.0.rst | 7 +- pandas/core/frame.py | 19 +-- pandas/core/generic.py | 272 ++++++++++++++++++++++++--------- 3 files changed, 215 insertions(+), 83 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7b073dcfec81f..e0ea3aed20eec 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -188,14 +188,15 @@ representation of :class:`DataFrame` objects (:issue:`4889`). .. _whatsnew_140.enhancements.shift: -DataFrame.shift and Series.shift now accept an iterable for parameter ``'period'`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +DataFrame.shift and Series.shift now accept an iterable for parameter ``'period'`` and new parameter ``'suffix'`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :meth:`DataFrame.shift` and :meth:`Series.shift` functions can take in an iterable, such as a list, for the period parameter. When an iterable is passed to either function it returns a :class:`DataFrame` object with all of the shifted rows or columns concatenated with one another. The function applies a shift designated by each element in the iterable. The resulting :class:`DataFrame` object's columns will retain the names from the :class:`DataFrame` object that called shift, but postfixed with _, where name is the original -column name and num correlates to the current element of the period iterable (:issue:`44424`). +column name and num correlates to the current element of the period iterable. The function also now takes in a ``'suffix'`` parameter to add a custom suffix +to the column names instead of adding the current element of the period iterable (:issue:`44424`). Usage within the :class:`DataFrame` class: .. ipython:: python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a71ee52c75113..ec1467f64e617 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5326,32 +5326,33 @@ def shift( freq: Frequency | None = None, axis: Axis = 0, fill_value=lib.no_default, + suffix=None, ) -> DataFrame: axis = self._get_axis_number(axis) - # Handle the case of multiple shifts + # GH#44424 Handle the case of multiple shifts if is_list_like(periods): new_df = DataFrame() from pandas.core.reshape.concat import concat + new_df_list = [] + for i in periods: if not isinstance(i, int): raise TypeError( f"Value {i} in periods is not an integer, expected an integer" ) - new_df = concat( - [ - new_df, - super() - .shift(periods=i, freq=freq, axis=axis, fill_value=fill_value) - .add_suffix(f"_{i}"), - ], - axis=1, + new_df_list.append( + super() + .shift(periods=i, freq=freq, axis=axis, fill_value=fill_value) + .add_suffix(f"_{i}" if suffix is None else suffix) ) + new_df = concat(new_df_list, axis=1) + if new_df.empty: return self diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fd8af2c0cedd0..4aff7acc4c6fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,7 +12,6 @@ from typing import ( TYPE_CHECKING, Any, - AnyStr, Callable, Hashable, Literal, @@ -44,7 +43,7 @@ Dtype, DtypeArg, DtypeObj, - FilePathOrBuffer, + FilePath, IndexKeyFunc, IndexLabel, JSONSerializable, @@ -58,6 +57,7 @@ TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, + WriteBuffer, npt, ) from pandas.compat._optional import import_optional_dependency @@ -2332,7 +2332,7 @@ def to_excel( @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, - path_or_buf: FilePathOrBuffer | None = None, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, orient: str | None = None, date_format: str | None = None, double_precision: int = 10, @@ -2353,9 +2353,10 @@ def to_json( Parameters ---------- - path_or_buf : str or file handle, optional - File path or object. If not specified, the result is returned as - a string. + path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. orient : str Indication of expected JSON string format. @@ -3271,6 +3272,7 @@ def to_latex( {returns} See Also -------- + Styler.to_latex : Render a DataFrame to LaTeX with conditional formatting. DataFrame.to_string : Render a DataFrame to a console-friendly tabular output. DataFrame.to_html : Render a DataFrame as an HTML table. @@ -3280,7 +3282,7 @@ def to_latex( >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], ... mask=['red', 'purple'], ... weapon=['sai', 'bo staff'])) - >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + >>> print(df.to_latex(index=False)) # doctest: +SKIP \begin{{tabular}}{{lll}} \toprule name & mask & weapon \\ @@ -3290,6 +3292,15 @@ def to_latex( \bottomrule \end{{tabular}} """ + msg = ( + "In future versions `DataFrame.to_latex` is expected to utilise the base " + "implementation of `Styler.to_latex` for formatting and rendering. " + "The arguments signature may therefore change. It is recommended instead " + "to use `DataFrame.style.to_latex` which also contains additional " + "functionality." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + # Get defaults from the pandas config if self.ndim == 1: self = self.to_frame() @@ -3337,7 +3348,7 @@ def to_latex( @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, - path_or_buf: FilePathOrBuffer[AnyStr] | None = None, + path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, sep: str = ",", na_rep: str = "", float_format: str | None = None, @@ -3364,10 +3375,11 @@ def to_csv( Parameters ---------- - path_or_buf : str or file handle, default None - File path or object, if None is provided the result is returned as - a string. If a non-binary file object is passed, it should be opened - with `newline=''`, disabling universal newlines. If a binary + path_or_buf : str, path object, file-like object, or None, default None + String, path object (implementing os.PathLike[str]), or file-like + object implementing a write() function. If None, the result is + returned as a string. If a non-binary file object is passed, it should + be opened with `newline=''`, disabling universal newlines. If a binary file object is passed, `mode` might need to contain a `'b'`. .. versionchanged:: 1.2.0 @@ -3409,6 +3421,9 @@ def to_csv( and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. + If `path_or_buf` is omitted or `None` or is a file opened in text + mode, this argument is ignored and an (uncompressed) string is + returned/written. .. versionchanged:: 1.0.0 @@ -5296,10 +5311,9 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. - random_state : int, array-like, BitGenerator, np.random.RandomState, - np.random.Generator, optional. If int, array-like, or BitGenerator, seed for - random number generator. If np.random.RandomState or np.random.Generator, - use as given. + random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional + If int, array-like, or BitGenerator, seed for random number generator. + If np.random.RandomState or np.random.Generator, use as given. .. versionchanged:: 1.1.0 @@ -5388,7 +5402,7 @@ def sample( num_legs num_wings num_specimen_seen falcon 2 2 10 fish 0 0 8 - """ + """ # noqa:E501 if axis is None: axis = self._stat_axis_number @@ -5835,7 +5849,8 @@ def astype( if col_name not in self: raise KeyError( "Only a column name can be used for the " - "key in a dtype mappings argument." + "key in a dtype mappings argument. " + f"'{col_name}' not found in columns." ) # GH#44417 cast to Series so we can use .iat below, which will be @@ -7890,11 +7905,10 @@ def resample( level : str or int, optional For a MultiIndex, level (name or number) to use for resampling. `level` must be datetime-like. - origin : {{'epoch', 'start', 'start_day', 'end', 'end_day'}}, Timestamp - or str, default 'start_day' + origin : Timestamp or str, default 'start_day' The timestamp on which to adjust the grouping. The timezone of origin must match the timezone of the index. - If a timestamp is not used, these values are also supported: + If string, must be one of the following: - 'epoch': `origin` is 1970-01-01 - 'start': `origin` is the first value of the timeseries @@ -10284,7 +10298,14 @@ def pct_change( return rs @final - def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + def _agg_by_level( + self, + name: str, + axis: Axis = 0, + level: Level = 0, + skipna: bool_t = True, + **kwargs, + ): if axis is None: raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) @@ -10297,9 +10318,17 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): @final def _logical_func( - self, name: str, func, axis=0, bool_only=None, skipna=True, level=None, **kwargs - ): + self, + name: str, + func, + axis: Axis = 0, + bool_only: bool_t | None = None, + skipna: bool_t = True, + level: Level | None = None, + **kwargs, + ) -> Series | bool_t: nv.validate_logical_func((), kwargs, fname=name) + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: warnings.warn( "Using the level keyword in DataFrame and Series aggregations is " @@ -10330,18 +10359,40 @@ def _logical_func( filter_type="bool", ) - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def any( + self, + axis: Axis = 0, + bool_only: bool_t | None = None, + skipna: bool_t = True, + level: Level | None = None, + **kwargs, + ) -> Series | bool_t: return self._logical_func( "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs ) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + def all( + self, + axis: Axis = 0, + bool_only: bool_t | None = None, + skipna: bool_t = True, + level: Level | None = None, + **kwargs, + ) -> Series | bool_t: return self._logical_func( "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs ) @final - def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): + def _accum_func( + self, + name: str, + func, + axis: Axis | None = None, + skipna: bool_t = True, + *args, + **kwargs, + ): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) if axis is None: axis = self._stat_axis_number @@ -10365,20 +10416,20 @@ def block_accum_func(blk_values): return self._constructor(result).__finalize__(self, method=name) - def cummax(self, axis=None, skipna=True, *args, **kwargs): + def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): return self._accum_func( "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs ) - def cummin(self, axis=None, skipna=True, *args, **kwargs): + def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): return self._accum_func( "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs ) - def cumsum(self, axis=None, skipna=True, *args, **kwargs): + def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) - def cumprod(self, axis=None, skipna=True, *args, **kwargs): + def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs): return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) @final @@ -10386,14 +10437,15 @@ def _stat_function_ddof( self, name: str, func, - axis=None, - skipna=True, - level=None, - ddof=1, - numeric_only=None, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + ddof: int = 1, + numeric_only: bool_t | None = None, **kwargs, - ): + ) -> Series | float: nv.validate_stat_ddof_func((), kwargs, fname=name) + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is None: axis = self._stat_axis_number if level is not None: @@ -10412,22 +10464,40 @@ def _stat_function_ddof( ) def sem( - self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs - ): + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + ddof: int = 1, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function_ddof( "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs ) def var( - self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs - ): + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + ddof: int = 1, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function_ddof( "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs ) def std( - self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs - ): + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + ddof: int = 1, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function_ddof( "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs ) @@ -10437,16 +10507,19 @@ def _stat_function( self, name: str, func, - axis=None, - skipna=True, - level=None, - numeric_only=None, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, **kwargs, ): if name == "median": nv.validate_median((), kwargs) else: nv.validate_stat_func((), kwargs, fname=name) + + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + if axis is None: axis = self._stat_axis_number if level is not None: @@ -10464,32 +10537,74 @@ def _stat_function( func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) - def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def min( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ): return self._stat_function( "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs ) - def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def max( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ): return self._stat_function( "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs ) - def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def mean( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function( "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs ) - def median(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def median( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function( "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs ) - def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def skew( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function( "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs ) - def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): + def kurt( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + **kwargs, + ) -> Series | float: return self._stat_function( "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs ) @@ -10501,11 +10616,11 @@ def _min_count_stat_function( self, name: str, func, - axis=None, - skipna=True, - level=None, - numeric_only=None, - min_count=0, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + min_count: int = 0, **kwargs, ): if name == "sum": @@ -10514,6 +10629,9 @@ def _min_count_stat_function( nv.validate_prod((), kwargs) else: nv.validate_stat_func((), kwargs, fname=name) + + validate_bool_kwarg(skipna, "skipna", none_allowed=False) + if axis is None: axis = self._stat_axis_number if level is not None: @@ -10543,10 +10661,10 @@ def _min_count_stat_function( def sum( self, - axis=None, - skipna=True, - level=None, - numeric_only=None, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, min_count=0, **kwargs, ): @@ -10556,11 +10674,11 @@ def sum( def prod( self, - axis=None, - skipna=True, - level=None, - numeric_only=None, - min_count=0, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + numeric_only: bool_t | None = None, + min_count: int = 0, **kwargs, ): return self._min_count_stat_function( @@ -10576,7 +10694,12 @@ def prod( product = prod - def mad(self, axis=None, skipna=None, level=None): + def mad( + self, + axis: Axis | None = None, + skipna: bool_t = True, + level: Level | None = None, + ) -> Series | float: """ {desc} @@ -10584,7 +10707,7 @@ def mad(self, axis=None, skipna=None, level=None): ---------- axis : {axis_descr} Axis for the function to be applied on. - skipna : bool, default None + skipna : bool, default True Exclude NA/null values when computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a @@ -10596,7 +10719,14 @@ def mad(self, axis=None, skipna=None, level=None): {see_also}\ {examples} """ - if skipna is None: + if not is_bool(skipna): + warnings.warn( + "Passing None for skipna is deprecated and will raise in a future" + "version. Pass True instead. Only boolean values will be allowed " + "in the future.", + FutureWarning, + stacklevel=find_stack_level(), + ) skipna = True if axis is None: axis = self._stat_axis_number @@ -10666,7 +10796,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): see_also="", examples="", ) - def mad(self, axis=None, skipna=None, level=None): + def mad(self, axis=None, skipna=True, level=None): return NDFrame.mad(self, axis, skipna, level) setattr(cls, "mad", mad) From 25fda61400deb0dbc2211f52ff62f679db4f1a82 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Mon, 29 Nov 2021 19:41:53 -0500 Subject: [PATCH 8/9] BUG: GH#44424 Formatting whatsnew Attempting to fix whatsnew rst file --- doc/source/whatsnew/v1.4.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e0ea3aed20eec..760a4f5cb5c46 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -199,6 +199,7 @@ column name and num correlates to the current element of the period iterable. Th to the column names instead of adding the current element of the period iterable (:issue:`44424`). Usage within the :class:`DataFrame` class: + .. ipython:: python df = pd.DataFrame({ @@ -209,6 +210,7 @@ Usage within the :class:`DataFrame` class: df.shift(shifts) Usage within the :class:`Series` class: + .. ipython:: python ser = pd.Series([1, 2, 3]) From 8db013175314c58dc1624d104cc533fdb55476b2 Mon Sep 17 00:00:00 2001 From: skwirskj Date: Thu, 9 Dec 2021 12:13:09 -0500 Subject: [PATCH 9/9] ENH: GH#44424 Added typing hints since shift can return series or dataframe now --- pandas/core/series.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ab0b5bbe4ed1a..309e7193afe6f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, Callable, + Collection, Hashable, Iterable, Literal, @@ -4938,8 +4939,9 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit): # error: Cannot determine type of 'shift' @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type] - def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> Series: - + def shift( + self, periods: int | Collection[int] = 1, freq=None, axis=0, fill_value=None + ) -> Series | DataFrame: # Handle the case of multiple shifts if is_list_like(periods): if len(periods) == 0: