Preserve Index and grouped columns in Groupby.nth (#13442)

galipremsagar · web-flow · commit 16c987e2051e · 2023-05-30T08:41:25.000-05:00
In pandas-2.0 `groupby.nth` behavior has changed: https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations This PR enables preserving the callers index in the end result and returns grouping columns as part of the result. This PR fixes all 12 pytests in `python/cudf/cudf/tests/test_groupby.py::test_groupby_nth`
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -802,10 +802,21 @@ def nth(self, n):
         """
         Return the nth row from each group.
         """
-        result = self.agg(lambda x: x.nth(n)).sort_index()
-        sizes = self.size().sort_index()
 
-        return result[sizes > n]
+        self.obj["__groupbynth_order__"] = range(0, len(self.obj))
+        # We perform another groupby here to have the grouping columns
+        # be a part of dataframe columns.
+        result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n))
+        sizes = self.size().reindex(result.index)
+
+        result = result[sizes > n]
+
+        result._index = self.obj.index.take(
+            result._data["__groupbynth_order__"]
+        )
+        del result._data["__groupbynth_order__"]
+        del self.obj._data["__groupbynth_order__"]
+        return result
 
     @_cudf_nvtx_annotate
     def ngroup(self, ascending=True):