diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 83beec5607986..d2e0e62e573b0 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -29,7 +29,7 @@ Enhancements Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` now accept a ``skipna`` argument (:issue:`25006`) - .. _whatsnew_1000.api_breaking: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6ade69fb4ca9d..117d9d9a27a58 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10293,6 +10293,10 @@ def _check_percentile(self, q): The number of consecutive NAs to fill before stopping. freq : DateOffset, timedelta, or offset alias string, optional Increment to use from time series API (e.g. 'M' or BDay()). + skipna : bool, default True + Exclude NA/null values before computing percent change. + + .. versionadded:: 1.0.0 **kwargs Additional keyword arguments are passed into `DataFrame.shift` or `Series.shift`. @@ -10309,6 +10313,11 @@ def _check_percentile(self, q): Series.shift : Shift the index by some number of periods. DataFrame.shift : Shift the index by some number of periods. + Notes + ----- + The default `skipna=True` drops NAs before computing the percentage + change, and the results are reindexed like the original calling object. + Examples -------- **Series** @@ -10332,22 +10341,53 @@ def _check_percentile(self, q): 2 -0.055556 dtype: float64 - See the percentage change in a Series where filling NAs with last - valid observation forward to next valid. + See how the computing of percentage change is performed in a Series + with NAs. With default `skipna=True`, NAs are dropped before the + computation and eventually the results are reindexed like the original + object, thus keeping the original NAs. 
- >>> s = pd.Series([90, 91, None, 85]) + >>> s = pd.Series([90, 91, np.nan, 85, np.nan, 95]) >>> s 0 90.0 1 91.0 2 NaN 3 85.0 + 4 NaN + 5 95.0 dtype: float64 + >>> s.pct_change() + 0 NaN + 1 0.011111 + 2 NaN + 3 -0.065934 + 4 NaN + 5 0.117647 + dtype: float64 + + By contrast, `skipna=False` will not drop NA values before + computation, instead evaluating each entry against the entry prior. + + >>> s.pct_change(skipna=False) + 0 NaN + 1 0.011111 + 2 NaN + 3 NaN + 4 NaN + 5 NaN + + On the other hand, if a fill method is passed, NAs are filled before + the computation. For example, before the computation of percentage + change, forward fill method `ffill` first fills NAs with last valid + observation forward to next valid. + >>> s.pct_change(fill_method='ffill') 0 NaN 1 0.011111 2 0.000000 3 -0.065934 + 4 0.000000 + 5 0.117647 dtype: float64 **DataFrame** @@ -10389,13 +10429,77 @@ def _check_percentile(self, q): 2016 2015 2014 GOOG NaN -0.151997 -0.086016 APPL NaN 0.337604 0.012002 + + In a DataFrame with NAs, when computing the percentage change with + default `skipna=True`, NAs are first dropped on each column/row, and + the results are eventually reindexed like the original object. + + >>> df = pd.DataFrame({ + ... 'a': [90, 91, np.nan, 85, np.nan, 95], + ... 'b': [91, np.nan, 85, np.nan, 95, np.nan], + ... 'c': [np.nan, 85, np.nan, 95, np.nan, np.nan]}) + >>> df + a b c + 0 90.0 91.0 NaN + 1 91.0 NaN 85.0 + 2 NaN 85.0 NaN + 3 85.0 NaN 95.0 + 4 NaN 95.0 NaN + 5 95.0 NaN NaN + + >>> df.pct_change() + a b c + 0 NaN NaN NaN + 1 0.011111 NaN NaN + 2 NaN -0.065934 NaN + 3 -0.065934 NaN 0.117647 + 4 NaN 0.117647 NaN + 5 0.117647 NaN NaN + + >>> df.pct_change(axis=1) + a b c + 0 NaN 0.011111 NaN + 1 NaN NaN -0.065934 + 2 NaN NaN NaN + 3 NaN NaN 0.117647 + 4 NaN NaN NaN + 5 NaN NaN NaN + + Otherwise, if a fill method is passed, NAs are filled before the + computation. 
+ + >>> df.pct_change(fill_method='ffill') + a b c + 0 NaN NaN NaN + 1 0.011111 0.000000 NaN + 2 0.000000 -0.065934 0.000000 + 3 -0.065934 0.000000 0.117647 + 4 0.000000 0.117647 0.000000 + 5 0.117647 0.000000 0.000000 """ @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): - # TODO: Not sure if above is correct - need someone to confirm. + def pct_change( + self, periods=1, fill_method=None, limit=None, freq=None, skipna=None, **kwargs + ): + if fill_method is not None and skipna: + raise ValueError("cannot pass both fill_method and skipna") + elif limit is not None and skipna: + raise ValueError("cannot pass both limit and skipna") + if fill_method is None and limit is None and skipna is None: + skipna = True axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) - if fill_method is None: + if skipna and isinstance(self, ABCDataFrame): + # If DataFrame, apply to each column/row + return self.apply( + lambda s: s.pct_change( + periods=periods, freq=freq, skipna=skipna, **kwargs + ), + axis=axis, + ) + if skipna: + data = self.dropna() + elif fill_method is None: data = self else: data = self.fillna(method=fill_method, limit=limit, axis=axis) @@ -10405,6 +10509,8 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwar if freq is None: mask = isna(com.values_from_object(data)) np.putmask(rs.values, mask, np.nan) + if skipna: + rs = rs.reindex_like(self) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e99208ac78e15..56d310006927b 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1627,6 +1627,80 @@ def test_pct_change(self): tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "skipna, periods, expected_vals", + [ + ( + True, + 1, + [ + 
[np.nan, np.nan], + [np.nan, np.nan], + [1.0, np.nan], + [0.5, 1.0], + [np.nan, 0.5], + [0.33333333, np.nan], + [np.nan, 0.33333333], + ], + ), + ( + True, + 2, + [ + [np.nan, np.nan], + [np.nan, np.nan], + [np.nan, np.nan], + [2.0, np.nan], + [np.nan, 2.0], + [1.0, np.nan], + [np.nan, 1.0], + ], + ), + ( + False, + 1, + [ + [np.nan, np.nan], + [np.nan, np.nan], + [1.0, np.nan], + [0.5, 1.0], + [np.nan, 0.5], + [np.nan, np.nan], + [np.nan, np.nan], + ], + ), + ( + False, + 2, + [ + [np.nan, np.nan], + [np.nan, np.nan], + [np.nan, np.nan], + [2.0, np.nan], + [np.nan, 2.0], + [0.33333333, np.nan], + [np.nan, 0.33333333], + ], + ), + ], + ) + def test_pct_change_skipna(self, skipna, periods, expected_vals): + # GH25006 + df = DataFrame( + [ + [np.nan, np.nan], + [1.0, np.nan], + [2.0, 1.0], + [3.0, 2.0], + [np.nan, 3.0], + [4.0, np.nan], + [np.nan, 4.0], + ] + ) + result = df.pct_change(skipna=skipna, periods=periods) + expected = DataFrame(expected_vals) + tm.assert_frame_equal(result, expected) + # ---------------------------------------------------------------------- # Index of max / min diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b8708e6ca1871..8c4ce7555526b 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -143,10 +143,10 @@ def test_diff_axis(self): assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])) def test_pct_change(self): - rs = self.tsframe.pct_change(fill_method=None) + rs = self.tsframe.pct_change(skipna=False, fill_method=None) assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) - rs = self.tsframe.pct_change(2) + rs = self.tsframe.pct_change(periods=2) filled = self.tsframe.fillna(method="pad") assert_frame_equal(rs, filled / filled.shift(2) - 1) @@ -165,7 +165,7 @@ def test_pct_change_shift_over_nas(self): df = DataFrame({"a": s, "b": s}) - chg = df.pct_change() + chg = df.pct_change(fill_method="ffill") expected 
= Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) edf = DataFrame({"a": expected, "b": expected}) assert_frame_equal(chg, edf) @@ -187,13 +187,15 @@ def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): freq=freq, fill_method=fill_method, limit=limit ) rs_periods = self.tsframe.pct_change( - periods, fill_method=fill_method, limit=limit + periods=periods, fill_method=fill_method, limit=limit ) assert_frame_equal(rs_freq, rs_periods) empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns) rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change( + periods=periods, fill_method=fill_method, limit=limit + ) assert_frame_equal(rs_freq, rs_periods) def test_frame_ctor_datetime64_column(self): diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 7b9e50ebbf342..d60b706091d96 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -585,6 +585,27 @@ def test_pct_change(self, periods, fill_method, limit, exp): else: tm.assert_series_equal(res, Series(exp)) + @pytest.mark.parametrize( + "fill_method, limit", + [ + ("backfill", None), + ("bfill", None), + ("pad", None), + ("ffill", None), + (None, 1), + ], + ) + def test_pct_change_skipna_raises(self, fill_method, limit): + # GH25006 + vals = [np.nan, np.nan, 1, 2, np.nan, 4, 10, np.nan] + obj = self._typ(vals) + if fill_method: + msg = "cannot pass both fill_method and skipna" + else: + msg = "cannot pass both limit and skipna" + with pytest.raises(ValueError, match=msg): + obj.pct_change(skipna=True, fill_method=fill_method, limit=limit) + class TestNDFrame: # tests that don't fit elsewhere diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1ddaa4692d741..65589b6402c60 100644 --- a/pandas/tests/series/test_analytics.py +++ 
b/pandas/tests/series/test_analytics.py @@ -2,7 +2,6 @@ import operator import numpy as np -from numpy import nan import pytest import pandas.util._test_decorators as td @@ -228,6 +227,21 @@ def test_cummax_timedelta64(self): result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.parametrize( + "periods, expected_vals", + [ + (1, [np.nan, np.nan, 1.0, 0.5, np.nan, 0.333333333333333, np.nan]), + (2, [np.nan, np.nan, np.nan, 2.0, np.nan, 1.0, np.nan]), + ], + ) + def test_pct_change_skipna(self, periods, expected_vals): + # GH25006 + vals = [np.nan, 1.0, 2.0, 3.0, np.nan, 4.0, np.nan] + s = Series(vals) + result = s.pct_change(skipna=True, periods=periods) + expected = Series(expected_vals) + assert_series_equal(expected, result) + def test_npdiff(self): pytest.skip("skipping due to Series no longer being an ndarray") @@ -235,7 +249,7 @@ def test_npdiff(self): s = Series(np.arange(5)) r = np.diff(s) - assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + assert_series_equal(Series([np.nan, 0, 0, 0, np.nan]), r) def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) @@ -476,14 +490,14 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, nan, 1, 2]]) + mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, np.nan, 1, 2]]) ts = Series(np.arange(len(mi)), index=mi) left = ts.count(level=1) - right = Series([2, 3, 1], index=[1, 2, nan]) + right = Series([2, 3, 1], index=[1, 2, np.nan]) assert_series_equal(left, right) - ts.iloc[[0, 3, 5]] = nan + ts.iloc[[0, 3, 5]] = np.nan assert_series_equal(ts.count(level=1), right - 1) def test_dot(self): @@ -708,11 +722,11 @@ def test_cummethods_bool(self): result = getattr(s, method)() assert_series_equal(result, expected) - e = pd.Series([False, True, nan, False]) - cse = pd.Series([0, 1, nan, 1], dtype=object) - cpe = pd.Series([False, 0, nan, 0]) 
- cmin = pd.Series([False, False, nan, False]) - cmax = pd.Series([False, True, nan, True]) + e = pd.Series([False, True, np.nan, False]) + cse = pd.Series([0, 1, np.nan, 1], dtype=object) + cpe = pd.Series([False, 0, np.nan, 0]) + cmin = pd.Series([False, False, np.nan, False]) + cmax = pd.Series([False, True, np.nan, True]) expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} for method in methods: @@ -980,8 +994,6 @@ def test_shift_categorical(self): assert_index_equal(s.values.categories, sn2.values.categories) def test_unstack(self): - from numpy import nan - index = MultiIndex( levels=[["bar", "foo"], ["one", "three", "two"]], codes=[[1, 1, 0, 0], [0, 1, 0, 2]], @@ -991,7 +1003,7 @@ def test_unstack(self): unstacked = s.unstack() expected = DataFrame( - [[2.0, nan, 3.0], [0.0, 1.0, nan]], + [[2.0, np.nan, 3.0], [0.0, 1.0, np.nan]], index=["bar", "foo"], columns=["one", "three", "two"], ) @@ -1018,7 +1030,9 @@ def test_unstack(self): idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1, 2], index=idx) left = ts.unstack() - right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5]) + right = DataFrame( + [[np.nan, 1], [2, np.nan]], index=[101, 102], columns=[np.nan, 3.5] + ) assert_frame_equal(left, right) idx = pd.MultiIndex.from_arrays( @@ -1030,9 +1044,10 @@ def test_unstack(self): ) ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) right = DataFrame( - [[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], columns=["cat", "dog"] + [[1.0, 1.3], [1.1, np.nan], [np.nan, 1.4], [1.2, np.nan]], + columns=["cat", "dog"], ) - tpls = [("a", 1), ("a", 2), ("b", nan), ("b", 1)] + tpls = [("a", 1), ("a", 2), ("b", np.nan), ("b", 1)] right.index = pd.MultiIndex.from_tuples(tpls) assert_frame_equal(ts.unstack(level=0), right) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index d0ca5d82c6b33..f353c672b6315 100644 --- a/pandas/tests/series/test_timeseries.py +++ 
b/pandas/tests/series/test_timeseries.py @@ -401,7 +401,7 @@ def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) - rs = self.ts.pct_change(2) + rs = self.ts.pct_change(periods=2) filled = self.ts.fillna(method="pad") assert_series_equal(rs, filled / filled.shift(2) - 1) @@ -418,7 +418,7 @@ def test_pct_change(self): def test_pct_change_shift_over_nas(self): s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - chg = s.pct_change() + chg = s.pct_change(fill_method="ffill") expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) assert_series_equal(chg, expected) @@ -436,12 +436,16 @@ def test_pct_change_shift_over_nas(self): def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 rs_freq = self.ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = self.ts.pct_change(periods, fill_method=fill_method, limit=limit) + rs_periods = self.ts.pct_change( + periods=periods, fill_method=fill_method, limit=limit + ) assert_series_equal(rs_freq, rs_periods) empty_ts = Series(index=self.ts.index) rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change( + periods=periods, fill_method=fill_method, limit=limit + ) assert_series_equal(rs_freq, rs_periods) def test_autocorr(self):