add whatsnew

simonariddell · simonariddell · commit 47b5908deeb1 · 2018-05-28T16:08:05.000-07:00
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -50,6 +50,7 @@ Groupby/Resample/Rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 - Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
+- Bug in :func:`DataFrame.pct_change` and :func:`Series.pct_change` where the percent change on non-monotonic groups was calculated in a vectorized way instead of within each group (:issue:`21200`)
 
 Strings
 ^^^^^^^
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2070,7 +2070,7 @@ def shift(self, periods=1, freq=None, axis=0):
     def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
                    axis=0):
         """Calcuate pct_change of each value to previous entry in group"""
-        if freq is not None or axis != 0:
+        if (freq is not None or axis != 0) or not self.grouper.is_monotonic:
             return self.apply(lambda x: x.pct_change(periods=periods,
                                                      fill_method=fill_method,
                                                      limit=limit, freq=freq,
@@ -3942,7 +3942,12 @@ def _apply_to_column_groupbys(self, func):
         return func(self)
 
     def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
-        """Calculate percent change of each value to previous entry in group"""
+        """Calculate pct_change of each value to previous entry in group"""
+        if not self.grouper.is_monotonic:
+            return self.apply(lambda x: x.pct_change(periods=periods,
+                                                     fill_method=fill_method,
+                                                     limit=limit, freq=freq))
+
         filled = getattr(self, fill_method)(limit=limit)
         shifted = filled.shift(periods=periods, freq=freq)
 
diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py
@@ -722,35 +722,52 @@ def interweave(list_obj):
 
 
 @pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("shuffle", [True, False])
 @pytest.mark.parametrize("periods,fill_method,limit", [
     (1, 'ffill', None), (1, 'ffill', 1),
     (1, 'bfill', None), (1, 'bfill', 1),
     (-1, 'ffill', None), (-1, 'ffill', 1),
     (-1, 'bfill', None), (-1, 'bfill', 1)])
-def test_pct_change(test_series, periods, fill_method, limit):
+def test_pct_change(test_series, shuffle, periods, fill_method, limit):
+    # Groupby pct_change falls back to an apply when the grouper is
+    # non-monotonic and is vectorized otherwise; shuffle tests both paths
     vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
-    exp_vals = Series(vals).pct_change(periods=periods,
-                                       fill_method=fill_method,
-                                       limit=limit).tolist()
-
-    df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
+    keys = ['a', 'b']
+    df = DataFrame({'key': [k for j in list(map(lambda x: [x] * len(vals), keys)) for k in j],
                     'vals': vals * 2})
+    if shuffle:
+        df = df.reindex(np.random.permutation(len(df))).reset_index(drop=True)
+
+    manual_apply = []
+    for k in keys:
+        manual_apply.append(Series(df.loc[df.key == k, 'vals'].values).pct_change(periods=periods,
+                                                                                  fill_method=fill_method,
+                                                                                  limit=limit))
+    exp_vals = pd.concat(manual_apply).reset_index(drop=True)
+    exp = pd.DataFrame(exp_vals, columns=['_pct_change'])
     grp = df.groupby('key')
 
     def get_result(grp_obj):
         return grp_obj.pct_change(periods=periods,
                                   fill_method=fill_method,
                                   limit=limit)
 
+    # Specifically test when monotonic and not monotonic
+
     if test_series:
-        exp = pd.Series(exp_vals * 2)
-        exp.name = 'vals'
+        exp = exp.loc[:, '_pct_change']
         grp = grp['vals']
         result = get_result(grp)
+        # Re-sort by key so the result lines up with the expected values
+        df.insert(0, '_pct_change', result)
+        result = df.sort_values(by='key')
+        result = result.loc[:, '_pct_change']
+        result = result.reset_index(drop=True)
         tm.assert_series_equal(result, exp)
     else:
-        exp = DataFrame({'vals': exp_vals * 2})
         result = get_result(grp)
+        result.reset_index(drop=True, inplace=True)
+        result.columns = ['_pct_change']
         tm.assert_frame_equal(result, exp)