From a73bd9a65022b7fddf5918fa8b939ac4492466ab Mon Sep 17 00:00:00 2001 From: Samuel Date: Tue, 20 Mar 2018 20:29:18 +0000 Subject: [PATCH 01/11] Fix docstring of pandas.DataFrame.stack. - Make description and summary clearer. - Fix doctests --- pandas/core/frame.py | 47 +++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efb002474f876..3f2e7189ce74f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5145,36 +5145,51 @@ def pivot_table(self, values=None, index=None, columns=None, def stack(self, level=-1, dropna=True): """ - Pivot a level of the (possibly hierarchical) column labels, returning a - DataFrame (or Series in the case of an object with a single level of - column labels) having a hierarchical index with a new inner-most level - of row labels. - The level involved will automatically get sorted. + Stack the prescribed level(s) from the column axis onto the index + axis. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + dataframe. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index level + is taken from the prescribed level(s) and the output is a + DataFrame. + + The new index levels are sorted. Parameters ---------- - level : int, string, or list of these, default last level - Level(s) to stack, can pass level name + level : int, string, list, default last level + Level(s) to stack from the column axis, defined as + integers or strings. dropna : boolean, default True - Whether to drop rows in the resulting Frame/Series with no valid - values + Whether to drop rows in the resulting Frame/Series with no + valid values. 
Examples ---------- + >>> s = pd.DataFrame([[0, 1], [2, 3]], index=['one', 'two'], columns=['a', 'b']) >>> s a b - one 1. 2. - two 3. 4. - + one 0 1 + two 2 3 >>> s.stack() - one a 1 - b 2 - two a 3 - b 4 + one a 0 + b 1 + two a 2 + b 3 + dtype: int64 Returns ------- stacked : DataFrame or Series + + See Also + -------- + pandas.DataFrame.unstack: unstack prescribed level(s) from index axis onto column axis. """ from pandas.core.reshape.reshape import stack, stack_multiple From b93efe78f39f076c8b4179062e525b576dc3bbba Mon Sep 17 00:00:00 2001 From: Samuel Date: Thu, 22 Mar 2018 21:24:06 +0000 Subject: [PATCH 02/11] Add fill_value parameter to DataFrame.shift. --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 320e8eda20657..ac5f97cc6e832 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3619,9 +3619,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, method=method, axis=axis) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): return super(DataFrame, self).shift(periods=periods, freq=freq, - axis=axis) + axis=axis, fill_value=fill_value) def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): From 21ae4993012ced3007431c6991325f9f6a62cd35 Mon Sep 17 00:00:00 2001 From: Samuel Date: Thu, 22 Mar 2018 21:31:24 +0000 Subject: [PATCH 03/11] Initial implementation. 
--- pandas/core/generic.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bd1a2371315a0..8601f633e9661 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7539,15 +7539,15 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, """) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): if periods == 0: return self block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis) + new_data = self._data.shift(periods=periods, axis=block_axis, fill_value=fill_value) else: - return self.tshift(periods, freq) + return self.tshift(periods, freq, fill_value) return self._constructor(new_data).__finalize__(self) @@ -7587,18 +7587,20 @@ def slice_shift(self, periods=1, axis=0): return new_obj.__finalize__(self) - def tshift(self, periods=1, freq=None, axis=0): + def tshift(self, periods=1, freq=None, axis=0, fill_value=np.nan): """ Shift the time index, using the index's frequency if available. Parameters ---------- periods : int - Number of periods to move, can be positive or negative + Number of periods to move, can be positive or negative. freq : DateOffset, timedelta, or time rule string, default None - Increment to use from the tseries module or time rule (e.g. 'EOM') + Increment to use from the tseries module or time rule (e.g. 'EOM'). axis : int or basestring - Corresponds to the axis that contains the Index + Corresponds to the axis that contains the Index. + fill_value : + Value to use to cover missing values. Notes ----- From caf1469f85dd1d92c0fe7765c69f05d9422e6a61 Mon Sep 17 00:00:00 2001 From: Samuel Date: Thu, 22 Mar 2018 21:36:08 +0000 Subject: [PATCH 04/11] Pass shift's fill_value parameter down the chain. 
--- pandas/core/internals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a0e122d390240..8ad523a64d868 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1279,12 +1279,12 @@ def diff(self, n, axis=1, mgr=None): new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0, mgr=None, fill_value=np.nan): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - new_values, fill_value = maybe_upcast(self.values) + new_values, fill_value = maybe_upcast(self.values, fill_value) # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous From b820f1f01fb7e9b94e2a466480088717d061b52a Mon Sep 17 00:00:00 2001 From: Samuel Date: Thu, 22 Mar 2018 21:41:20 +0000 Subject: [PATCH 05/11] remove the stack docstring commits --- pandas/core/frame.py | 49 ++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ac5f97cc6e832..9baf65b461c9a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5145,51 +5145,32 @@ def pivot_table(self, values=None, index=None, columns=None, def stack(self, level=-1, dropna=True): """ - Stack the prescribed level(s) from the column axis onto the index - axis. - - Return a reshaped DataFrame or Series having a multi-level - index with one or more new inner-most levels compared to the current - dataframe. The new inner-most levels are created by pivoting the - columns of the current dataframe: - - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index level - is taken from the prescribed level(s) and the output is a - DataFrame. 
- - The new index levels are sorted. - + Pivot a level of the (possibly hierarchical) column labels, returning a + DataFrame (or Series in the case of an object with a single level of + column labels) having a hierarchical index with a new inner-most level + of row labels. + The level involved will automatically get sorted. Parameters ---------- - level : int, string, list, default last level - Level(s) to stack from the column axis, defined as - integers or strings. + level : int, string, or list of these, default last level + Level(s) to stack, can pass level name dropna : boolean, default True - Whether to drop rows in the resulting Frame/Series with no - valid values. - + Whether to drop rows in the resulting Frame/Series with no valid + values Examples ---------- - >>> s = pd.DataFrame([[0, 1], [2, 3]], index=['one', 'two'], columns=['a', 'b']) >>> s a b - one 0 1 - two 2 3 + one 1. 2. + two 3. 4. >>> s.stack() - one a 0 - b 1 - two a 2 - b 3 - dtype: int64 - + one a 1 + b 2 + two a 3 + b 4 Returns ------- stacked : DataFrame or Series - - See Also - -------- - pandas.DataFrame.unstack: unstack prescribed level(s) from index axis onto column axis. """ from pandas.core.reshape.reshape import stack, stack_multiple From 6b0bcb8d9742d2dfa5faa855171481c60cc13a29 Mon Sep 17 00:00:00 2001 From: Samuel Date: Thu, 22 Mar 2018 21:42:20 +0000 Subject: [PATCH 06/11] Add missing lines --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9baf65b461c9a..64a793d7be05c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5150,6 +5150,7 @@ def stack(self, level=-1, dropna=True): column labels) having a hierarchical index with a new inner-most level of row labels. The level involved will automatically get sorted. 
+ Parameters ---------- level : int, string, or list of these, default last level @@ -5157,17 +5158,20 @@ def stack(self, level=-1, dropna=True): dropna : boolean, default True Whether to drop rows in the resulting Frame/Series with no valid values + Examples ---------- >>> s a b one 1. 2. two 3. 4. + >>> s.stack() one a 1 b 2 two a 3 b 4 + Returns ------- stacked : DataFrame or Series From 0019217ca6b141983749e4eee63c2b9aa9ce5f63 Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 25 Mar 2018 13:10:58 +0100 Subject: [PATCH 07/11] fix flake8 error: overly long line. --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8601f633e9661..0a8e7763becab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7545,7 +7545,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis, fill_value=fill_value) + new_data = self._data.shift(periods=periods, axis=block_axis, + fill_value=fill_value) else: return self.tshift(periods, freq, fill_value) From 7c318b2cc790e8ac08d6cd187e914407d11cb88f Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 25 Mar 2018 13:52:38 +0100 Subject: [PATCH 08/11] Add Series.shift test for fill_value parameter. Handles special case of Categorical variables where we don't want to pass the fill_value parameter. 
--- pandas/core/generic.py | 6 ++++-- pandas/core/series.py | 5 +++-- pandas/tests/series/test_analytics.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a8e7763becab..4b24989c4029b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7544,9 +7544,11 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): return self block_axis = self._get_block_manager_axis(axis) + shift_kwargs = {'periods': periods, 'axis': block_axis} + if not is_categorical_dtype(self): + shift_kwargs['fill_value'] = fill_value if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis, - fill_value=fill_value) + new_data = self._data.shift(**shift_kwargs) else: return self.tshift(periods, freq, fill_value) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3e3600898ba7f..d8de1d8963f53 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3358,8 +3358,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, axis=axis) @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0): - return super(Series, self).shift(periods=periods, freq=freq, axis=axis) + def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): + return super(Series, self).shift(periods=periods, freq=freq, axis=axis, + fill_value=fill_value) def reindex_axis(self, labels, axis=0, **kwargs): """Conform Series to new index with optional filling logic. 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0e6e44e839464..c4c47ec9e7698 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1607,6 +1607,17 @@ def test_shift_int(self): expected = ts.astype(float).shift(1) assert_series_equal(shifted, expected) + def test_shift_fillna(self): + # ENH 15486 + ts = self.ts.astype(int) + fillval = 0 + shifted = ts.shift(1, fill_value=fillval) + # default behaviour adds nan so converts to floats + default = ts.shift(1) + default.iloc[0] = fillval + expected = default.astype(int) + assert_series_equal(shifted, expected) + def test_shift_categorical(self): # GH 9416 s = pd.Series(['a', 'b', 'c', 'd'], dtype='category') From bd943625ac8f6ccd29c839e66f04300615280792 Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 25 Mar 2018 14:09:28 +0100 Subject: [PATCH 09/11] Add bool dataframe test with fillna. --- pandas/tests/frame/test_timeseries.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index ceb6c942c81b1..c8ffde727aa94 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -308,6 +308,14 @@ def test_shift_bool(self): columns=['high', 'low']) assert_frame_equal(rs, xp) + def test_shift_bool_fillna(self): + df = DataFrame({'high': [True, False], + 'low': [False, False]}) + rs = df.shift(1, fill_value=True) + xp = DataFrame({'high': [True, True], + 'low': [True, False]}) + assert_frame_equal(rs, xp) + def test_shift_categorical(self): # GH 9416 s1 = pd.Series(['a', 'b', 'c'], dtype='category') From 667a1440618ac3134dd625296ee16eb5b24d4b1e Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 25 Mar 2018 14:10:02 +0100 Subject: [PATCH 10/11] Attempt to solve issue in tests around fillna. 
--- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4b24989c4029b..24203e021a9be 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7550,7 +7550,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=np.nan): if freq is None: new_data = self._data.shift(**shift_kwargs) else: - return self.tshift(periods, freq, fill_value) + tshift_kwargs = {'periods': periods, 'freq': freq} + if not is_categorical_dtype(self): + tshift_kwargs['fill_value'] = fill_value + return self.tshift(**tshift_kwargs) return self._constructor(new_data).__finalize__(self) From 9600d538888c15b69a88dca83f6cc77adec9af86 Mon Sep 17 00:00:00 2001 From: Samuel Date: Sun, 29 Apr 2018 11:42:07 +0100 Subject: [PATCH 11/11] Add missing fill_value param to block managers Fix test_timeseries.py --- pandas/core/internals.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8ad523a64d868..59be34ad16133 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2541,7 +2541,7 @@ def _try_coerce_result(self, result): return result - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0, mgr=None, fill_value=np.nan): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) @@ -2879,7 +2879,7 @@ def _try_coerce_result(self, result): def _box_func(self): return lambda x: tslib.Timestamp(x, tz=self.dtype.tz) - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0, mgr=None, fill_value=np.nan): """ shift the block by periods """ # think about moving this to the DatetimeIndex. 
This is a non-freq @@ -3072,7 +3072,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, return [self.make_block_same_class(values=values, placement=self.mgr_locs)] - def shift(self, periods, axis=0, mgr=None): + def shift(self, periods, axis=0, mgr=None, fill_value=np.nan): """ shift the block by periods """ N = len(self.values.T) indexer = np.zeros(N, dtype=int)