diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ca951e946bad..48380bd9b46b8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -369,6 +369,18 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class Shift: + def setup(self): + N = 18 + self.df = DataFrame({"g": ["a", "b"] * 9, "v": list(range(N))}) + + def time_defaults(self): + self.df.groupby("g").shift() + + def time_fill_value(self): + self.df.groupby("g").shift(fill_value=99) + + class FillNA: def setup(self): N = 100 diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 432dd46000eb3..1ca9104d3adf3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -168,6 +168,7 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) - Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`) +- Performance improvement in :meth:`.GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`) .. 
--------------------------------------------------------------------------- @@ -262,6 +263,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`) - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`) +- Bug in :meth:`.GroupBy.shift` that would return the grouping columns if ``fill_value`` was not ``None`` (:issue:`41556`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9c695148a75c0..f27c1d7eb0db5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2822,6 +2822,7 @@ def _get_cythonized_result( result_is_index: bool = False, pre_processing=None, post_processing=None, + fill_value=None, **kwargs, ): """ @@ -2872,6 +2873,8 @@ def _get_cythonized_result( second argument, i.e. the signature should be (ndarray, Type). If `needs_nullable=True`, a third argument should be `nullable`, to allow for processing specific to nullable values. + fill_value : any, default None + The scalar value to use for newly introduced missing values. **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -2946,7 +2949,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = result.reshape(-1) if result_is_index: - result = algorithms.take_nd(values, result) + result = algorithms.take_nd(values, result, fill_value=fill_value) if post_processing: pp_kwargs = {} @@ -3022,7 +3025,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): tshift : Shift the time index, using the index’s frequency if available. 
""" - if freq is not None or axis != 0 or not isna(fill_value): + if freq is not None or axis != 0: return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) return self._get_cythonized_result( @@ -3032,6 +3035,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): needs_ngroups=True, result_is_index=True, periods=periods, + fill_value=fill_value, ) @final diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index c6f3e7618e3f7..e9517b4544f0b 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -55,7 +55,7 @@ def test_group_shift_with_fill_value(): columns=["Z"], index=None, ) - result = g.shift(-1, fill_value=0)[["Z"]] + result = g.shift(-1, fill_value=0) tm.assert_frame_equal(result, expected)