From ccaf2c8b8298126a54063b268a4f4980939beebc Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Mon, 19 Feb 2024 21:51:08 +1100 Subject: [PATCH 1/8] DOC: Fix PR01 errors in multiple files (#57438) Fixed PR01 errors (#57438) and added parameter descriptions for: - `pandas.Grouper` - `pandas.core.groupby.DataFrameGroupBy.cummax` - `pandas.core.groupby.DataFrameGroupBy.cummin` - `pandas.core.groupby.DataFrameGroupBy.cumprod` - `pandas.core.groupby.DataFrameGroupBy.cumsum` - `pandas.core.groupby.DataFrameGroupBy.filter` - `pandas.core.groupby.DataFrameGroupBy.pct_change` - `pandas.core.groupby.DataFrameGroupBy.rolling` - `pandas.core.groupby.SeriesGroupBy.cummax` - `pandas.core.groupby.SeriesGroupBy.cummin` - `pandas.core.groupby.SeriesGroupBy.cumprod` - `pandas.core.groupby.SeriesGroupBy.cumsum` - `pandas.core.groupby.SeriesGroupBy.nunique` - `pandas.core.groupby.SeriesGroupBy.pct_change` - `pandas.core.groupby.SeriesGroupBy.rolling` Fixed functions also removed from code_checks.sh --- ci/code_checks.sh | 15 --------- pandas/core/groupby/generic.py | 9 ++++++ pandas/core/groupby/groupby.py | 56 ++++++++++++++++++++++++++++++++++ pandas/core/groupby/grouper.py | 4 +++ 4 files changed, 69 insertions(+), 15 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 231d40e17c0c0..04c3ff3a42971 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -1477,7 +1477,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.DatetimeIndex.std\ pandas.ExcelFile\ pandas.ExcelFile.parse\ - pandas.Grouper\ pandas.HDFStore.append\ pandas.HDFStore.put\ pandas.Index.get_indexer_for\ @@ -1538,21 +1537,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.api.types.is_float\ pandas.api.types.is_hashable\ pandas.api.types.is_integer\ - pandas.core.groupby.DataFrameGroupBy.cummax\ - pandas.core.groupby.DataFrameGroupBy.cummin\ - pandas.core.groupby.DataFrameGroupBy.cumprod\ - pandas.core.groupby.DataFrameGroupBy.cumsum\ - pandas.core.groupby.DataFrameGroupBy.filter\ - pandas.core.groupby.DataFrameGroupBy.pct_change\ - 
pandas.core.groupby.DataFrameGroupBy.rolling\ - pandas.core.groupby.SeriesGroupBy.cummax\ - pandas.core.groupby.SeriesGroupBy.cummin\ - pandas.core.groupby.SeriesGroupBy.cumprod\ - pandas.core.groupby.SeriesGroupBy.cumsum\ pandas.core.groupby.SeriesGroupBy.filter\ - pandas.core.groupby.SeriesGroupBy.nunique\ - pandas.core.groupby.SeriesGroupBy.pct_change\ - pandas.core.groupby.SeriesGroupBy.rolling\ pandas.core.resample.Resampler.max\ pandas.core.resample.Resampler.min\ pandas.core.resample.Resampler.quantile\ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dfe145783c4a3..f396849929e0d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -723,6 +723,11 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: """ Return number of unique elements in the group. + Parameters + ---------- + dropna : bool, default True + Don't include NaN in the counts. + Returns ------- Series @@ -1942,6 +1947,10 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: dropna : bool Drop groups that do not pass the filter. True by default; if False, groups that evaluate False are filled with NaNs. + *args + Additional positional arguments to pass to `func`. + **kwargs + Additional keyword arguments to pass to `func`. Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 103f7b55c0550..f7b944e63a88b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4672,6 +4672,13 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: """ Cumulative product for each group. + Parameters + ---------- + *args : tuple + Positional arguments to be passed to `func`. + **kwargs : dict + Additional/specific keyword arguments to be passed to the function, such as `numeric_only` and `skipna`. 
+ Returns ------- Series or DataFrame @@ -4721,6 +4728,13 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: def cumsum(self, *args, **kwargs) -> NDFrameT: """ Cumulative sum for each group. + + Parameters + ---------- + *args : tuple + Positional arguments to be passed to `func`. + **kwargs : dict + Additional/specific keyword arguments to be passed to the function, such as `numeric_only` and `skipna`. Returns ------- @@ -4776,6 +4790,14 @@ def cummin( """ Cumulative min for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + **kwargs : dict, optional + Additional keyword arguments to be passed to the function, such as `skipna`, to control whether + NA/null values are ignored. + Returns ------- Series or DataFrame @@ -4838,6 +4860,14 @@ def cummax( """ Cumulative max for each group. + Parameters + ---------- + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + **kwargs : dict, optional + Additional keyword arguments to be passed to the function, such as `skipna`, to control whether + NA/null values are ignored. + Returns ------- Series or DataFrame @@ -5134,6 +5164,32 @@ def pct_change( """ Calculate pct_change of each value to previous entry in group. + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating percentage change. (E.g., 1 + compares adjacent elements, while a period of 2 compares every other element). + + fill_method : FillnaOptions or None, default None + This parameter specifies how to handle missing values after the initial shift operation + that is necessary for calculating the percentage change. In future versions, users will be + encouraged to manually handle missing values prior to using `pct_change`. Options for fill methods + (when explicitly specified) are as follows: + + - A FillnaOptions value (e.g., 'ffill', 'bfill') to specify forward or backward filling. 
+ - None to indicate that no filling should be performed. + + Note: Direct use of this parameter is discouraged due to the impending deprecation. + + limit : int or None, default None + The maximum number of consecutive NA values to forward or backward fill, depending on the + `fill_method` chosen. This parameter's functionality is also subject to deprecation, and it is + recommended that NaN values be addressed prior to calling `pct_change`. + + freq : str, pandas offset object or None, default None + Increment to use from time series API (e.g., 'M' for month-end frequency). This parameter is + only relevant if the data is time series. If None, the frequency inferred from the index will be used. + Returns ------- Series or DataFrame diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b5e4881ffc29c..9179bec86f660 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -68,6 +68,10 @@ class Grouper: Parameters ---------- + *args + Currently unused, reserved for future use. + **kwargs + Dictionary of the keyword arguments to pass to Grouper. key : str, defaults to None Groupby key, which selects the grouping column of the target. level : name/number, defaults to None From 66e62a33bcfbbf4e034eafb3c785c439bc6ebf3d Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Mon, 19 Feb 2024 22:25:50 +1100 Subject: [PATCH 2/8] Updated formatting --- pandas/core/groupby/groupby.py | 46 ++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f7b944e63a88b..97366a12161b9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4677,7 +4677,8 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: *args : tuple Positional arguments to be passed to `func`. **kwargs : dict - Additional/specific keyword arguments to be passed to the function, such as `numeric_only` and `skipna`. 
+ Additional/specific keyword arguments to be passed to the function, + such as `numeric_only` and `skipna`. Returns ------- @@ -4734,7 +4735,8 @@ def cumsum(self, *args, **kwargs) -> NDFrameT: *args : tuple Positional arguments to be passed to `func`. **kwargs : dict - Additional/specific keyword arguments to be passed to the function, such as `numeric_only` and `skipna`. + Additional/specific keyword arguments to be passed to the function, + such as `numeric_only` and `skipna`. Returns ------- @@ -4795,8 +4797,8 @@ def cummin( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. **kwargs : dict, optional - Additional keyword arguments to be passed to the function, such as `skipna`, to control whether - NA/null values are ignored. + Additional keyword arguments to be passed to the function, such as `skipna`, + to control whether NA/null values are ignored. Returns ------- @@ -5167,28 +5169,28 @@ def pct_change( Parameters ---------- periods : int, default 1 - Periods to shift for calculating percentage change. (E.g., 1 - compares adjacent elements, while a period of 2 compares every other element). + Periods to shift for calculating percentage change. Comparing with + a period of 1 means adjacent elements are compared, whereas a period + of 2 compares every other element. fill_method : FillnaOptions or None, default None - This parameter specifies how to handle missing values after the initial shift operation - that is necessary for calculating the percentage change. In future versions, users will be - encouraged to manually handle missing values prior to using `pct_change`. Options for fill methods - (when explicitly specified) are as follows: - - - A FillnaOptions value (e.g., 'ffill', 'bfill') to specify forward or backward filling. - - None to indicate that no filling should be performed. - - Note: Direct use of this parameter is discouraged due to the impending deprecation. 
+ Specifies how to handle missing values after the initial shift + operation necessary for percentage change calculation. Users are + encouraged to handle missing values manually in future versions. + Valid options are: + - A FillnaOptions value ('ffill', 'bfill') for forward or backward filling. + - None to avoid filling. + Note: Usage is discouraged due to impending deprecation. limit : int or None, default None - The maximum number of consecutive NA values to forward or backward fill, depending on the - `fill_method` chosen. This parameter's functionality is also subject to deprecation, and it is - recommended that NaN values be addressed prior to calling `pct_change`. - - freq : str, pandas offset object or None, default None - Increment to use from time series API (e.g., 'M' for month-end frequency). This parameter is - only relevant if the data is time series. If None, the frequency inferred from the index will be used. + The maximum number of consecutive NA values to fill, based on the chosen + `fill_method`. Address NaN values prior to using `pct_change` as this + parameter is nearing deprecation. + + freq : str, pandas offset object, or None, default None + The frequency increment for time series data (e.g., 'M' for month-end). + If None, the frequency is inferred from the index. Relevant for time + series data only. Returns ------- From 84edc1ebdfdfea1dafe6dbec3012c388b8cc54e4 Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Mon, 19 Feb 2024 22:30:19 +1100 Subject: [PATCH 3/8] Corrected E501 formatting --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 97366a12161b9..bc4c2bf3223ba 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4867,8 +4867,8 @@ def cummax( numeric_only : bool, default False Include only `float`, `int` or `boolean` data. 
**kwargs : dict, optional - Additional keyword arguments to be passed to the function, such as `skipna`, to control whether - NA/null values are ignored. + Additional keyword arguments to be passed to the function, such as `skipna`, + to control whether NA/null values are ignored. Returns ------- From 4caa5d60511350d2f897ed0f80768a9644f033da Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Mon, 19 Feb 2024 22:44:32 +1100 Subject: [PATCH 4/8] Remove trailing whitespaces --- pandas/core/groupby/groupby.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bc4c2bf3223ba..7c9fe0df9d022 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4729,7 +4729,7 @@ def cumprod(self, *args, **kwargs) -> NDFrameT: def cumsum(self, *args, **kwargs) -> NDFrameT: """ Cumulative sum for each group. - + Parameters ---------- *args : tuple @@ -5169,27 +5169,27 @@ def pct_change( Parameters ---------- periods : int, default 1 - Periods to shift for calculating percentage change. Comparing with - a period of 1 means adjacent elements are compared, whereas a period + Periods to shift for calculating percentage change. Comparing with + a period of 1 means adjacent elements are compared, whereas a period of 2 compares every other element. fill_method : FillnaOptions or None, default None - Specifies how to handle missing values after the initial shift - operation necessary for percentage change calculation. Users are - encouraged to handle missing values manually in future versions. + Specifies how to handle missing values after the initial shift + operation necessary for percentage change calculation. Users are + encouraged to handle missing values manually in future versions. Valid options are: - A FillnaOptions value ('ffill', 'bfill') for forward or backward filling. - None to avoid filling. Note: Usage is discouraged due to impending deprecation. 
limit : int or None, default None - The maximum number of consecutive NA values to fill, based on the chosen - `fill_method`. Address NaN values prior to using `pct_change` as this + The maximum number of consecutive NA values to fill, based on the chosen + `fill_method`. Address NaN values prior to using `pct_change` as this parameter is nearing deprecation. freq : str, pandas offset object, or None, default None - The frequency increment for time series data (e.g., 'M' for month-end). - If None, the frequency is inferred from the index. Relevant for time + The frequency increment for time series data (e.g., 'M' for month-end). + If None, the frequency is inferred from the index. Relevant for time series data only. Returns From ce42a2ee6d6ca904cc53025f03d251ce0c78086f Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Wed, 21 Feb 2024 01:39:20 +1100 Subject: [PATCH 5/8] Fix PR02 error from docstring inheritance for resampler nunique method Resolved a PR02 documentation error arising from the inherited `nunique` method docstring within `groupby.SeriesGroupBy.nunique`. The issue was due to the `resample.Resampler.nunique` method lacking a `dropna` parameter, which is present in the inherited docstring. - Introduced `_nunique_extra_params` to dynamically insert parameter documentation only where applicable (i.e. where `groupby.SeriesGroupBy.nunique` is). --- pandas/core/groupby/generic.py | 11 ++++++++--- pandas/core/resample.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f396849929e0d..62c1325e01e12 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -719,15 +719,20 @@ def true_and_notna(x) -> bool: filtered = self._apply_filter(indices, dropna) return filtered - def nunique(self, dropna: bool = True) -> Series | DataFrame: + _nunique_extra_params = dedent( """ - Return number of unique elements in the group. 
- Parameters ---------- dropna : bool, default True Don't include NaN in the counts. + """ + ) + @doc(extra_params=_nunique_extra_params) + def nunique(self, dropna: bool = True) -> Series | DataFrame: + """ + Return number of unique elements in the group. + {extra_params} Returns ------- Series diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e60dcdb10e653..3dd6f8994edd5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1358,7 +1358,7 @@ def ohlc(self): return self._downsample("ohlc") @final - @doc(SeriesGroupBy.nunique) + @doc(SeriesGroupBy.nunique, extra_params="") def nunique(self): return self._downsample("nunique") From 4add81da7f4b4477c99b12a6926e40b83c3b38e8 Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Wed, 21 Feb 2024 11:19:56 +1100 Subject: [PATCH 6/8] Resolve request --- pandas/core/groupby/generic.py | 36 +++++++--------------------------- pandas/core/resample.py | 33 +++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 62c1325e01e12..c90ae4d590b45 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -719,29 +719,26 @@ def true_and_notna(x) -> bool: filtered = self._apply_filter(indices, dropna) return filtered - _nunique_extra_params = dedent( + def nunique(self, dropna: bool = True) -> Series | DataFrame: """ + Return number of unique elements in the group. + Parameters ---------- dropna : bool, default True Don't include NaN in the counts. - """ - ) - @doc(extra_params=_nunique_extra_params) - def nunique(self, dropna: bool = True) -> Series | DataFrame: - """ - Return number of unique elements in the group. - {extra_params} Returns ------- Series Number of unique values within each group. - Examples + See Also -------- - For SeriesGroupby: + core.resample.Resampler.nunique : Method nunique for Resampler. 
+ Examples + -------- >>> lst = ["a", "a", "b", "b"] >>> ser = pd.Series([1, 2, 3, 3], index=lst) >>> ser @@ -754,25 +751,6 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame: a 2 b 1 dtype: int64 - - For Resampler: - - >>> ser = pd.Series( - ... [1, 2, 3, 3], - ... index=pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] - ... ), - ... ) - >>> ser - 2023-01-01 1 - 2023-01-15 2 - 2023-02-01 3 - 2023-02-15 3 - dtype: int64 - >>> ser.resample("MS").nunique() - 2023-01-01 2 - 2023-02-01 1 - Freq: MS, dtype: int64 """ ids, ngroups = self._grouper.group_info val = self.obj._values diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3dd6f8994edd5..4147437114b2f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -59,7 +59,6 @@ NDFrame, _shared_docs, ) -from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, @@ -1358,8 +1357,38 @@ def ohlc(self): return self._downsample("ohlc") @final - @doc(SeriesGroupBy.nunique, extra_params="") def nunique(self): + """ + Return number of unique elements in the group. + + Returns + ------- + Series + Number of unique values within each group. + + See Also + -------- + core.groupby.SeriesGroupBy.nunique : Method nunique for SeriesGroupBy. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 3], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 3 + dtype: int64 + >>> ser.resample("MS").nunique() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ return self._downsample("nunique") @final From 88398c045e8c96ff17fd9eb7b4fd3d36a4053646 Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Wed, 21 Feb 2024 11:23:55 +1100 Subject: [PATCH 7/8] Undo mistake --- pandas/core/resample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 4147437114b2f..bddb6f777482e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -59,6 +59,7 @@ NDFrame, _shared_docs, ) +from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, From 68d66fc537f903c076fd9baf2866e673e4ef906d Mon Sep 17 00:00:00 2001 From: Deen-dot Date: Wed, 21 Feb 2024 11:38:29 +1100 Subject: [PATCH 8/8] Remove unnecessary import to fix ruff check --- pandas/core/resample.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bddb6f777482e..4147437114b2f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -59,7 +59,6 @@ NDFrame, _shared_docs, ) -from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy,