Fix type mismatch in groupby reduction for empty objects (#13942)

galipremsagar · web-flow · commit 6ed42d7c26ab · 2023-08-24T01:32:54.000Z
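Closes #13941. This PR preserves column types for groupby reduction operations performed on empty objects: when the grouped object is empty and the aggregation is a supported reduction, the empty result column is cast back to its original dtype (list, struct, and decimal columns are left unchanged). Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: #13942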
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -553,8 +553,8 @@ def agg(self, func):
             orig_dtypes,
         ):
             for agg, col in zip(aggs, cols):
+                agg_name = agg.__name__ if callable(agg) else agg
                 if multilevel:
-                    agg_name = agg.__name__ if callable(agg) else agg
                     key = (col_name, agg_name)
                 else:
                     key = col_name
@@ -564,7 +564,26 @@ def agg(self, func):
                 ):
                     # Structs lose their labels which we reconstruct here
                     col = col._with_type_metadata(cudf.ListDtype(orig_dtype))
-                data[key] = col
+
+                if (
+                    self.obj.empty
+                    and (
+                        isinstance(agg_name, str)
+                        and agg_name in Reducible._SUPPORTED_REDUCTIONS
+                    )
+                    and len(col) == 0
+                    and not isinstance(
+                        col,
+                        (
+                            cudf.core.column.ListColumn,
+                            cudf.core.column.StructColumn,
+                            cudf.core.column.DecimalBaseColumn,
+                        ),
+                    )
+                ):
+                    data[key] = col.astype(orig_dtype)
+                else:
+                    data[key] = col
         data = ColumnAccessor(data, multiindex=multilevel)
         if not multilevel:
             data = data.rename_levels({np.nan: None}, level=0)
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -3342,6 +3342,33 @@ def test_group_by_pandas_sort_order(groups, sort):
         )
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"],
+)
+@pytest.mark.parametrize(
+    "reduce_op",
+    [
+        "min",
+        "max",
+        "idxmin",
+        "idxmax",
+        "first",
+        "last",
+    ],
+)
+def test_group_by_empty_reduction(dtype, reduce_op):
+    gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype)
+    pdf = gdf.to_pandas()
+
+    gg = gdf.groupby("a")["c"]
+    pg = pdf.groupby("a")["c"]
+
+    assert_eq(
+        getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True
+    )
+
+
 @pytest.mark.parametrize(
     "dtype",
     ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"],
@@ -3357,6 +3384,7 @@ def test_group_by_empty_apply(request, dtype, apply_op):
             reason=("sum isn't supported for datetime64[ns]"),
         )
     )
+
     gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype)
     pdf = gdf.to_pandas()