PERF: add the 'name' attribute to dataframes that go through apply_frame_axis0

Joe Jevnik · mattip · commit 6c3d1f573a87 · 2017-03-30T23:12:38.000+03:00
Previously, accessing `group.name` inside the applied function failed during the fast-path inference call because the attribute did not exist on the chunk, so `apply` fell back to the slower path. The `name` attribute is now set on the first chunk before the function is called, and the unused `shape_before` local is removed. Author: Joe Jevnik <joe@quantopian.com> This patch had conflicts when merged, resolved by Committer: Jeff Reback <jeff@reback.net> Closes pandas-dev#15062 from llllllllll/add-name-in-apply-inference-call and squashes the following commits: 722a945 [Joe Jevnik] DOC: update whatsnew for groupby perf change 7e75635 [Joe Jevnik] DEV: add groupby asv benchmark 710528a [Joe Jevnik] BUG: add the 'name' attribute to dataframes that go through apply_frame_axis0
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -108,16 +108,34 @@ def setup(self):
         self.N = 10000
         self.labels = np.random.randint(0, 2000, size=self.N)
         self.labels2 = np.random.randint(0, 3, size=self.N)
-        self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
-
-    def f(self, g):
+        self.df = DataFrame({
+            'key': self.labels,
+            'key2': self.labels2,
+            'value1': np.random.randn(self.N),
+            'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)),
+        })
+
+    @staticmethod
+    def scalar_function(g):
         return 1
 
-    def time_groupby_frame_apply(self):
-        self.df.groupby(['key', 'key2']).apply(self.f)
+    def time_groupby_frame_apply_scalar_function(self):
+        self.df.groupby(['key', 'key2']).apply(self.scalar_function)
+
+    def time_groupby_frame_apply_scalar_function_overhead(self):
+        self.df.groupby('key').apply(self.scalar_function)
+
+    @staticmethod
+    def df_copy_function(g):
+        # ensure that the group name is available (see GH #15062)
+        g.name
+        return g.copy()
+
+    def time_groupby_frame_df_copy_function(self):
+        self.df.groupby(['key', 'key2']).apply(self.df_copy_function)
 
-    def time_groupby_frame_apply_overhead(self):
-        self.df.groupby('key').apply(self.f)
+    def time_groupby_frame_apply_df_copy_overhead(self):
+        self.df.groupby('key').apply(self.df_copy_function)
 
 
 #----------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -831,6 +831,9 @@ Performance Improvements
 - Improved performance when using ``.unstack()`` (:issue:`15503`)
 - Improved performance of merge/join on ``category`` columns (:issue:`10409`)
 - Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
+- Improved performance of ``pd.core.groupby.GroupBy.apply`` when the applied
+  function uses the ``.name`` attribute of the group DataFrame (:issue:`15062`)
+
 
 
 .. _whatsnew_0200.bug_fixes:
diff --git a/pandas/_libs/src/reduce.pyx b/pandas/_libs/src/reduce.pyx
@@ -497,7 +497,7 @@ def apply_frame_axis0(object frame, object f, object names,
     # Need to infer if our low-level mucking is going to cause a segfault
     if n > 0:
         chunk = frame.iloc[starts[0]:ends[0]]
-        shape_before = chunk.shape
+        object.__setattr__(chunk, 'name', names[0])
         try:
             result = f(chunk)
             if result is chunk:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -3244,6 +3244,22 @@ def _check_all(grouped):
         _check_all(self.df.groupby('A'))
         _check_all(self.df.groupby(['A', 'B']))
 
+    def test_group_name_available_in_inference_pass(self):
+        # gh-15062
+        df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+        names = []
+
+        def f(group):
+            names.append(group.name)
+            return group.copy()
+
+        df.groupby('a', sort=False, group_keys=False).apply(f)
+        # we expect 2 zeros because we call ``f`` once to see if a faster route
+        # can be used.
+        expected_names = [0, 0, 1, 2]
+        tm.assert_equal(names, expected_names)
+
     def test_no_dummy_key_names(self):
         # GH #1291