Merge remote-tracking branch 'upstream/master' into fix-39904

DriesSchaumont · DriesSchaumont · commit da2a451e39cf · 2021-03-29T15:34:53.000+02:00
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
@@ -101,7 +101,7 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei
 
 All windowing operations support a ``min_periods`` argument that dictates the minimum amount of
 non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``.
-``min_peridos`` defaults to 1 for time-based windows and ``window`` for fixed windows
+``min_periods`` defaults to 1 for time-based windows and ``window`` for fixed windows.
 
 .. ipython:: python
 
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -302,6 +302,38 @@ cast to ``dtype=object`` (:issue:`38709`)
    ser2
 
 
+.. _whatsnew_130.notable_bug_fixes.rolling_groupby_column:
+
+GroupBy.rolling no longer returns grouped-by column in values
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The grouped-by column is no longer included in the values returned by a
+``groupby.rolling`` operation (:issue:`32262`).
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": [1, 1, 2, 3], "B": [0, 1, 2, 3]})
+    df
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [1]: df.groupby("A").rolling(2).sum()
+    Out[1]:
+           A    B
+    A
+    1 0  NaN  NaN
+    1    2.0  1.0
+    2 2  NaN  NaN
+    3 3  NaN  NaN
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.groupby("A").rolling(2).sum()
+
 .. _whatsnew_130.notable_bug_fixes.rolling_var_precision:
 
 Removed artificial truncation in rolling variance and standard deviation
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -681,18 +681,17 @@ group_mean_float64 = _group_mean['double']
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def _group_ohlc(floating[:, ::1] out,
-                int64_t[::1] counts,
-                ndarray[floating, ndim=2] values,
-                const intp_t[:] labels,
-                Py_ssize_t min_count=-1):
+def group_ohlc(floating[:, ::1] out,
+               int64_t[::1] counts,
+               ndarray[floating, ndim=2] values,
+               const intp_t[:] labels,
+               Py_ssize_t min_count=-1):
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab
-        floating val, count
-        Py_ssize_t ngroups = len(counts)
+        floating val
 
     assert min_count == -1, "'min_count' only used in add and prod"
 
@@ -727,10 +726,6 @@ def _group_ohlc(floating[:, ::1] out,
                 out[lab, 3] = val
 
 
-group_ohlc_float32 = _group_ohlc['float']
-group_ohlc_float64 = _group_ohlc['double']
-
-
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_quantile(ndarray[float64_t] out,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -542,7 +542,7 @@ class DataFrame(NDFrame, OpsMixin):
     >>> from dataclasses import make_dataclass
     >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
     >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
-        x  y
+       x  y
     0  0  0
     1  0  3
     2  2  3
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -355,15 +355,15 @@ def _aggregate_multiple_funcs(self, arg):
     # TODO: index should not be Optional - see GH 35490
     def _wrap_series_output(
         self,
-        output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
+        output: Mapping[base.OutputKey, Union[Series, ArrayLike]],
         index: Optional[Index],
     ) -> FrameOrSeriesUnion:
         """
         Wraps the output of a SeriesGroupBy operation into the expected result.
 
         Parameters
         ----------
-        output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        output : Mapping[base.OutputKey, Union[Series, np.ndarray, ExtensionArray]]
             Data to wrap.
         index : pd.Index or None
             Index to apply to the output.
@@ -420,14 +420,14 @@ def _wrap_aggregated_output(
         return self._reindex_output(result)
 
     def _wrap_transformed_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        self, output: Mapping[base.OutputKey, Union[Series, ArrayLike]]
     ) -> Series:
         """
         Wraps the output of a SeriesGroupBy aggregation into the expected result.
 
         Parameters
         ----------
-        output : dict[base.OutputKey, Union[Series, np.ndarray]]
+        output : dict[base.OutputKey, Union[Series, np.ndarray, ExtensionArray]]
             Dict with a sole key of 0 and a value of the result values.
 
         Returns
@@ -1119,6 +1119,7 @@ def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike:
 
             if isinstance(values, Categorical) and isinstance(result, np.ndarray):
                 # If the Categorical op didn't raise, it is dtype-preserving
+                # We get here with how="first", "last", "min", "max"
                 result = type(values)._from_sequence(result.ravel(), dtype=values.dtype)
                 # Note this will have result.dtype == dtype from above
 
@@ -1195,9 +1196,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
                     assert how == "ohlc"
                     raise
 
-                # error: Incompatible types in assignment (expression has type
-                # "ExtensionArray", variable has type "ndarray")
-                result = py_fallback(values)  # type: ignore[assignment]
+                result = py_fallback(values)
 
             return cast_agg_result(result, values, how)
 
@@ -1753,14 +1752,14 @@ def _wrap_aggregated_output(
         return self._reindex_output(result)
 
     def _wrap_transformed_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        self, output: Mapping[base.OutputKey, Union[Series, ArrayLike]]
     ) -> DataFrame:
         """
         Wraps the output of DataFrameGroupBy transformations into the expected result.
 
         Parameters
         ----------
-        output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        output : Mapping[base.OutputKey, Union[Series, np.ndarray, ExtensionArray]]
             Data to wrap.
 
         Returns
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -84,7 +84,6 @@ class providing the base-class of operations.
 import pandas.core.algorithms as algorithms
 from pandas.core.arrays import (
     Categorical,
-    DatetimeArray,
     ExtensionArray,
 )
 from pandas.core.base import (
@@ -1026,7 +1025,7 @@ def _cumcount_array(self, ascending: bool = True):
     def _cython_transform(
         self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs
     ):
-        output: Dict[base.OutputKey, np.ndarray] = {}
+        output: Dict[base.OutputKey, ArrayLike] = {}
 
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
@@ -1054,7 +1053,7 @@ def _wrap_aggregated_output(
     ):
         raise AbstractMethodError(self)
 
-    def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
+    def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
         raise AbstractMethodError(self)
 
     def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
@@ -1099,7 +1098,7 @@ def _agg_general(
     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
     ):
-        output: Dict[base.OutputKey, Union[np.ndarray, DatetimeArray]] = {}
+        output: Dict[base.OutputKey, ArrayLike] = {}
         # Ideally we would be able to enumerate self._iterate_slices and use
         # the index from enumeration as the key of output, but ohlc in particular
         # returns a (n x 4) array. Output requires 1D ndarrays as values, so we
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -31,6 +31,7 @@
 import pandas._libs.groupby as libgroupby
 import pandas._libs.reduction as libreduction
 from pandas._typing import (
+    ArrayLike,
     DtypeObj,
     F,
     FrameOrSeries,
@@ -485,6 +486,12 @@ def _get_cython_func_and_vals(
                 func = _get_cython_function(kind, how, values.dtype, is_numeric)
             else:
                 raise
+        else:
+            if values.dtype.kind in ["i", "u"]:
+                if how in ["ohlc"]:
+                    # The output may still include nans, so we have to cast
+                    values = ensure_float64(values)
+
         return func, values
 
     @final
@@ -524,7 +531,7 @@ def _disallow_invalid_ops(
     @final
     def _ea_wrap_cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
-    ) -> np.ndarray:
+    ) -> ArrayLike:
         """
         If we have an ExtensionArray, unwrap, call _cython_operation, and
         re-wrap if appropriate.
@@ -576,7 +583,7 @@ def _ea_wrap_cython_operation(
     @final
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
-    ) -> np.ndarray:
+    ) -> ArrayLike:
         """
         Returns the values of a cython operation.
         """
@@ -683,11 +690,11 @@ def _cython_operation(
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             # "rank" is the only member of cython_cast_blocklist we get here
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
-            # error: Incompatible types in assignment (expression has type
-            # "Union[ExtensionArray, ndarray]", variable has type "ndarray")
-            result = maybe_downcast_to_dtype(result, dtype)  # type: ignore[assignment]
+            op_result = maybe_downcast_to_dtype(result, dtype)
+        else:
+            op_result = result
 
-        return result
+        return op_result
 
     @final
     def _aggregate(
@@ -784,14 +791,10 @@ def _aggregate_series_pure_python(self, obj: Series, func: F):
             counts[label] = group.shape[0]
             result[label] = res
 
-        result = lib.maybe_convert_objects(result, try_float=False)
-        # error: Incompatible types in assignment (expression has type
-        # "Union[ExtensionArray, ndarray]", variable has type "ndarray")
-        result = maybe_cast_result(  # type: ignore[assignment]
-            result, obj, numeric_only=True
-        )
+        out = lib.maybe_convert_objects(result, try_float=False)
+        out = maybe_cast_result(out, obj, numeric_only=True)
 
-        return result, counts
+        return out, counts
 
 
 class BinGrouper(BaseGrouper):
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -861,7 +861,4 @@ def _rolling_window(a: np.ndarray, window: int):
     # https://stackoverflow.com/a/6811241
     shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
     strides = a.strides + (a.strides[-1],)
-    # error: Module has no attribute "stride_tricks"
-    return np.lib.stride_tricks.as_strided(  # type: ignore[attr-defined]
-        a, shape=shape, strides=strides
-    )
+    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1925,13 +1925,13 @@ def get_dummies(self, sep="|"):
         Examples
         --------
         >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
-        a  b  c
+           a  b  c
         0  1  1  0
         1  1  0  0
         2  1  0  1
 
         >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
-        a  b  c
+           a  b  c
         0  1  1  0
         1  0  0  0
         2  1  0  1
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -558,6 +558,10 @@ def __init__(
         if _grouper is None:
             raise ValueError("Must pass a Grouper object.")
         self._grouper = _grouper
+        # GH 32262: It's convention to keep the grouping column in
+        # groupby.<agg_func>, but unexpected to users in
+        # groupby.rolling.<agg_func>
+        obj = obj.drop(columns=self._grouper.names, errors="ignore")
         super().__init__(obj, *args, **kwargs)
 
     def _apply(
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -579,13 +579,24 @@ def legend_title(self) -> Optional[str]:
             stringified = map(pprint_thing, self.data.columns.names)
             return ",".join(stringified)
 
-    def _add_legend_handle(self, handle, label, index=None):
-        if label is not None:
-            if self.mark_right and index is not None:
-                if self.on_right(index):
-                    label = label + " (right)"
-            self.legend_handles.append(handle)
-            self.legend_labels.append(label)
+    def _mark_right_label(self, label: str, index: int) -> str:
+        """
+        Append ``(right)`` to the label of a line if it's plotted on the right axis.
+
+        Note that ``(right)`` is only appended when ``subplots=False``.
+        """
+        if not self.subplots and self.mark_right and self.on_right(index):
+            label += " (right)"
+        return label
+
+    def _append_legend_handles_labels(self, handle: Artist, label: str) -> None:
+        """
+        Append current handle and label to ``legend_handles`` and ``legend_labels``.
+
+        These will be used to make the legend.
+        """
+        self.legend_handles.append(handle)
+        self.legend_labels.append(label)
 
     def _make_legend(self):
         ax, leg, handle = self._get_ax_legend_handle(self.axes[0])
@@ -1078,7 +1089,7 @@ def _make_plot(self):
                 cbar.ax.set_yticklabels(self.data[c].cat.categories)
 
         if label is not None:
-            self._add_legend_handle(scatter, label)
+            self._append_legend_handles_labels(scatter, label)
         else:
             self.legend = False
 
@@ -1170,6 +1181,7 @@ def _make_plot(self):
             kwds = dict(kwds, **errors)
 
             label = pprint_thing(label)  # .encode('utf-8')
+            label = self._mark_right_label(label, index=i)
             kwds["label"] = label
 
             newlines = plotf(
@@ -1182,7 +1194,7 @@ def _make_plot(self):
                 is_errorbar=is_errorbar,
                 **kwds,
             )
-            self._add_legend_handle(newlines[0], label, index=i)
+            self._append_legend_handles_labels(newlines[0], label)
 
             if self._is_ts_plot():
 
@@ -1458,6 +1470,7 @@ def _make_plot(self):
             kwds = dict(kwds, **errors)
 
             label = pprint_thing(label)
+            label = self._mark_right_label(label, index=i)
 
             if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None):
                 kwds["ecolor"] = mpl.rcParams["xtick.color"]
@@ -1508,7 +1521,7 @@ def _make_plot(self):
                     log=self.log,
                     **kwds,
                 )
-            self._add_legend_handle(rect, label, index=i)
+            self._append_legend_handles_labels(rect, label)
 
     def _post_plot_logic(self, ax: Axes, data):
         if self.use_index:
@@ -1620,4 +1633,4 @@ def blank_labeler(label, value):
             # leglabels is used for legend labels
             leglabels = labels if labels is not None else idx
             for p, l in zip(patches, leglabels):
-                self._add_legend_handle(p, l)
+                self._append_legend_handles_labels(p, l)
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py