ENH: fill_value argument for shift #15486 (#24128)

ahcub · jreback · commit bf31c045ce45 · 2018-12-25T20:29:53.000-05:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -31,6 +31,7 @@ New features
 - :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
 - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
   See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
+- :meth:`DataFrame.shift`, :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept ``fill_value`` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`)
 
 .. _whatsnew_0240.values_api:
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -16,6 +16,7 @@
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
+from pandas.core.dtypes.missing import isna
 
 from pandas.core import ops
 
@@ -449,8 +450,8 @@ def dropna(self):
         """
         return self[~self.isna()]
 
-    def shift(self, periods=1):
-        # type: (int) -> ExtensionArray
+    def shift(self, periods=1, fill_value=None):
+        # type: (int, object) -> ExtensionArray
         """
         Shift values by desired number.
 
@@ -465,6 +466,12 @@ def shift(self, periods=1):
             The number of periods to shift. Negative values are allowed
             for shifting backwards.
 
+        fill_value : object, optional
+            The scalar value to use for newly introduced missing values.
+            The default is ``self.dtype.na_value``.
+
+            .. versionadded:: 0.24.0
+
         Returns
         -------
         shifted : ExtensionArray
@@ -483,8 +490,11 @@ def shift(self, periods=1):
         if not len(self) or periods == 0:
             return self.copy()
 
+        if isna(fill_value):
+            fill_value = self.dtype.na_value
+
         empty = self._from_sequence(
-            [self.dtype.na_value] * min(abs(periods), len(self)),
+            [fill_value] * min(abs(periods), len(self)),
             dtype=self.dtype
         )
         if periods > 0:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1257,14 +1257,18 @@ def shape(self):
 
         return tuple([len(self._codes)])
 
-    def shift(self, periods):
+    def shift(self, periods, fill_value=None):
         """
         Shift Categorical by desired number of periods.
 
         Parameters
         ----------
         periods : int
             Number of periods to move, can be positive or negative
+        fill_value : object, optional
+            The scalar value to use for newly introduced missing values.
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -1277,10 +1281,18 @@ def shift(self, periods):
             raise NotImplementedError("Categorical with ndim > 1.")
         if np.prod(codes.shape) and (periods != 0):
             codes = np.roll(codes, ensure_platform_int(periods), axis=0)
+            if isna(fill_value):
+                fill_value = -1
+            elif fill_value in self.categories:
+                fill_value = self.categories.get_loc(fill_value)
+            else:
+                raise ValueError("'fill_value={}' is not present "
+                                 "in this Categorical's "
+                                 "categories".format(fill_value))
             if periods > 0:
-                codes[:periods] = -1
+                codes[:periods] = fill_value
             else:
-                codes[periods:] = -1
+                codes[periods:] = fill_value
 
         return self.from_codes(codes, categories=self.categories,
                                ordered=self.ordered)
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -457,7 +457,7 @@ def value_counts(self, dropna=False):
 
     # --------------------------------------------------------------------
 
-    def shift(self, periods=1):
+    def shift(self, periods=1, fill_value=None):
         """
         Shift values by desired number.
 
@@ -471,6 +471,9 @@ def shift(self, periods=1):
         periods : int, default 1
             The number of periods to shift. Negative values are allowed
             for shifting backwards.
+        fill_value : optional, default NaT
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -479,7 +482,7 @@ def shift(self, periods=1):
         # TODO(DatetimeArray): remove
         # The semantics for Index.shift differ from EA.shift
         # then just call super.
-        return ExtensionArray.shift(self, periods)
+        return ExtensionArray.shift(self, periods, fill_value=fill_value)
 
     def _time_shift(self, n, freq=None):
         """
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -889,12 +889,15 @@ def fillna(self, value=None, method=None, limit=None):
 
         return self._simple_new(new_values, self._sparse_index, new_dtype)
 
-    def shift(self, periods=1):
+    def shift(self, periods=1, fill_value=None):
 
         if not len(self) or periods == 0:
             return self.copy()
 
-        subtype = np.result_type(np.nan, self.dtype.subtype)
+        if isna(fill_value):
+            fill_value = self.dtype.na_value
+
+        subtype = np.result_type(fill_value, self.dtype.subtype)
 
         if subtype != self.dtype.subtype:
             # just coerce up front
@@ -903,7 +906,7 @@ def shift(self, periods=1):
             arr = self
 
         empty = self._from_sequence(
-            [self.dtype.na_value] * min(abs(periods), len(self)),
+            [fill_value] * min(abs(periods), len(self)),
             dtype=arr.dtype
         )
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3938,9 +3938,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                                               method=method)
 
     @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
-    def shift(self, periods=1, freq=None, axis=0):
+    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         return super(DataFrame, self).shift(periods=periods, freq=freq,
-                                            axis=axis)
+                                            axis=axis, fill_value=fill_value)
 
     def set_index(self, keys, drop=True, append=False, inplace=False,
                   verify_integrity=False):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -8849,6 +8849,14 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
             extend the index when shifting and preserve the original data.
         axis : {0 or 'index', 1 or 'columns', None}, default None
             Shift direction.
+        fill_value : object, optional
+            The scalar value to use for newly introduced missing values.
+            The default depends on the dtype of `self`.
+            For numeric data, ``np.nan`` is used.
+            For datetime, timedelta, period data, etc., :attr:`NaT` is used.
+            For extension dtypes, ``self.dtype.na_value`` is used.
+
+            .. versionadded:: 0.24.0
 
         Returns
         -------
@@ -8884,16 +8892,25 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
         2   NaN  15.0  18.0
         3   NaN  30.0  33.0
         4   NaN  45.0  48.0
+
+        >>> df.shift(periods=3, fill_value=0)
+           Col1  Col2  Col3
+        0     0     0     0
+        1     0     0     0
+        2     0     0     0
+        3    10    13    17
+        4    20    23    27
     """)
 
     @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
-    def shift(self, periods=1, freq=None, axis=0):
+    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         if periods == 0:
             return self.copy()
 
         block_axis = self._get_block_manager_axis(axis)
         if freq is None:
-            new_data = self._data.shift(periods=periods, axis=block_axis)
+            new_data = self._data.shift(periods=periods, axis=block_axis,
+                                        fill_value=fill_value)
         else:
             return self.tshift(periods, freq)
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1994,7 +1994,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False,
 
     @Substitution(name='groupby')
     @Appender(_common_see_also)
-    def shift(self, periods=1, freq=None, axis=0):
+    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """
         Shift each group by periods observations.
 
@@ -2004,10 +2004,14 @@ def shift(self, periods=1, freq=None, axis=0):
             number of periods to shift
         freq : frequency string
         axis : axis to shift, default 0
+        fill_value : optional
+
+            .. versionadded:: 0.24.0
         """
 
-        if freq is not None or axis != 0:
-            return self.apply(lambda x: x.shift(periods, freq, axis))
+        if freq is not None or axis != 0 or not isna(fill_value):
+            return self.apply(lambda x: x.shift(periods, freq,
+                                                axis, fill_value))
 
         return self._get_cythonized_result('group_shift_indexer',
                                            self.grouper, cython_dtype=np.int64,
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -1261,12 +1261,12 @@ def diff(self, n, axis=1):
         new_values = algos.diff(self.values, n, axis=axis)
         return [self.make_block(values=new_values)]
 
-    def shift(self, periods, axis=0):
+    def shift(self, periods, axis=0, fill_value=None):
         """ shift the block by periods, possibly upcast """
 
         # convert integer to float if necessary. need to do a lot more than
         # that, handle boolean etc also
-        new_values, fill_value = maybe_upcast(self.values)
+        new_values, fill_value = maybe_upcast(self.values, fill_value)
 
         # make sure array sent to np.roll is c_contiguous
         f_ordered = new_values.flags.f_contiguous
@@ -1955,17 +1955,19 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
                                  limit=limit),
             placement=self.mgr_locs)
 
-    def shift(self, periods, axis=0):
+    def shift(self, periods, axis=0, fill_value=None):
         """
         Shift the block by `periods`.
 
         Dispatches to underlying ExtensionArray and re-boxes in an
         ExtensionBlock.
         """
         # type: (int, Optional[BlockPlacement]) -> List[ExtensionBlock]
-        return [self.make_block_same_class(self.values.shift(periods=periods),
-                                           placement=self.mgr_locs,
-                                           ndim=self.ndim)]
+        return [
+            self.make_block_same_class(
+                self.values.shift(periods=periods, fill_value=fill_value),
+                placement=self.mgr_locs, ndim=self.ndim)
+        ]
 
     def where(self, other, cond, align=True, errors='raise',
               try_cast=False, axis=0, transpose=False):
@@ -3023,7 +3025,7 @@ def _try_coerce_result(self, result):
     def _box_func(self):
         return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz)
 
-    def shift(self, periods, axis=0):
+    def shift(self, periods, axis=0, fill_value=None):
         """ shift the block by periods """
 
         # think about moving this to the DatetimeIndex. This is a non-freq
@@ -3038,10 +3040,12 @@ def shift(self, periods, axis=0):
 
         new_values = self.values.asi8.take(indexer)
 
+        if isna(fill_value):
+            fill_value = tslibs.iNaT
         if periods > 0:
-            new_values[:periods] = tslibs.iNaT
+            new_values[:periods] = fill_value
         else:
-            new_values[periods:] = tslibs.iNaT
+            new_values[periods:] = fill_value
 
         new_values = self.values._shallow_copy(new_values)
         return [self.make_block_same_class(new_values,
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3765,8 +3765,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
                                            regex=regex, method=method)
 
     @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs)
-    def shift(self, periods=1, freq=None, axis=0):
-        return super(Series, self).shift(periods=periods, freq=freq, axis=axis)
+    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+        return super(Series, self).shift(periods=periods, freq=freq, axis=axis,
+                                         fill_value=fill_value)
 
     def reindex_axis(self, labels, axis=0, **kwargs):
         """
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -10,6 +10,7 @@
 import pandas.util._test_decorators as td
 
 import pandas as pd
+from pandas import isna
 from pandas.core.sparse.api import SparseArray, SparseDtype, SparseSeries
 import pandas.util.testing as tm
 from pandas.util.testing import assert_almost_equal
@@ -262,6 +263,18 @@ def test_take_negative(self):
         exp = SparseArray(np.take(self.arr_data, [-4, -3, -2]))
         tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp)
 
+    @pytest.mark.parametrize('fill_value', [0, None, np.nan])
+    def test_shift_fill_value(self, fill_value):
+        # GH #24128
+        sparse = SparseArray(np.array([1, 0, 0, 3, 0]),
+                             fill_value=8.0)
+        res = sparse.shift(1, fill_value=fill_value)
+        if isna(fill_value):
+            fill_value = res.dtype.na_value
+        exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]),
+                          fill_value=8.0)
+        tm.assert_sp_array_equal(res, exp)
+
     def test_bad_take(self):
         with pytest.raises(IndexError, match="bounds"):
             self.arr.take([11])
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -221,6 +221,17 @@ def test_shift_empty_array(self, data, periods):
         expected = empty
         self.assert_extension_array_equal(result, expected)
 
+    def test_shift_fill_value(self, data):
+        arr = data[:4]
+        fill_value = data[0]
+        result = arr.shift(1, fill_value=fill_value)
+        expected = data.take([0, 0, 1, 2])
+        self.assert_extension_array_equal(result, expected)
+
+        result = arr.shift(-2, fill_value=fill_value)
+        expected = data.take([2, 3, 0, 0])
+        self.assert_extension_array_equal(result, expected)
+
     @pytest.mark.parametrize("as_frame", [True, False])
     def test_hash_pandas_object_works(self, data, as_frame):
         # https://github.com/pandas-dev/pandas/issues/23066
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
@@ -320,6 +320,20 @@ def test_shift_categorical(self):
         xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)})
         assert_frame_equal(rs, xp)
 
+    def test_shift_fill_value(self):
+        # GH #24128
+        df = DataFrame([1, 2, 3, 4, 5],
+                       index=date_range('1/1/2000', periods=5, freq='H'))
+        exp = DataFrame([0, 1, 2, 3, 4],
+                        index=date_range('1/1/2000', periods=5, freq='H'))
+        result = df.shift(1, fill_value=0)
+        assert_frame_equal(result, exp)
+
+        exp = DataFrame([0, 0, 1, 2, 3],
+                        index=date_range('1/1/2000', periods=5, freq='H'))
+        result = df.shift(2, fill_value=0)
+        assert_frame_equal(result, exp)
+
     def test_shift_empty(self):
         # Regression test for #8019
         df = DataFrame({'foo': []})
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -9,7 +9,8 @@
 from pandas.compat import PY37
 from pandas import (Index, MultiIndex, CategoricalIndex,
                     DataFrame, Categorical, Series, qcut)
-from pandas.util.testing import assert_frame_equal, assert_series_equal
+from pandas.util.testing import (assert_equal,
+                                 assert_frame_equal, assert_series_equal)
 import pandas.util.testing as tm
 
 
@@ -860,3 +861,13 @@ def test_groupby_multiindex_categorical_datetime():
     expected = pd.DataFrame(
         {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
     assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT])
+def test_shift(fill_value):
+    ct = pd.Categorical(['a', 'b', 'c', 'd'],
+                        categories=['a', 'b', 'c', 'd'], ordered=False)
+    expected = pd.Categorical([None, 'a', 'b', 'c'],
+                              categories=['a', 'b', 'c', 'd'], ordered=False)
+    res = ct.shift(1, fill_value=fill_value)
+    assert_equal(res, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py