Enforce GroupBy.__iter__ deprecation and miscellaneous pytest fixes (#13423)

galipremsagar · web-flow · commit 2dafcfcddf57 · 2023-05-26T15:39:17.000-05:00
This PR:

- [x] Enforces deprecation in `GroupBy.__iter__` 
- [x] Fixes miscellaneous pytest failures due to already existing differences in cudf vs pandas & newly introduced `inferred_type` in Index.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2314,7 +2314,16 @@ def as_column(
                         pa_type = np_to_pa_dtype(
                             _maybe_convert_to_default_type("float")
                         )
-
+                    if (
+                        pa_type is None
+                        and isinstance(arbitrary, pd.Index)
+                        and arbitrary.shape == (0,)
+                    ):
+                        # When an empty `pd.Index` is passed to `pa.array`,
+                        # a type of `null-type` is returned by pyarrow, hence
+                        # we need this workaround to preserve the dtype of
+                        # column being created.
+                        pa_type = np_to_pa_dtype(arbitrary.dtype)
                 data = as_column(
                     pa.array(
                         arbitrary,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -5852,6 +5852,7 @@ def _reduce(
     ):
 
         source = self
+        axis = source._get_axis_from_axis_arg(axis)
         if numeric_only:
             numeric_cols = (
                 name
@@ -5860,9 +5861,11 @@ def _reduce(
             )
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
-                return Series(index=self.index)
-
-        axis = source._get_axis_from_axis_arg(axis)
+                return Series(
+                    index=self._data.to_pandas_index()[:0]
+                    if axis == 0
+                    else source.index
+                )
 
         if axis == 0:
             try:
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -272,19 +272,13 @@ def __init__(
             self.grouping = _Grouping(obj, by, level)
 
     def __iter__(self):
-        if isinstance(self._by, list) and len(self._by) == 1:
-            warnings.warn(
-                "In a future version of cudf, a length 1 tuple will be "
-                "returned when iterating over a groupby with a grouper equal "
-                "to a list of length 1. To avoid this warning, do not supply "
-                "a list with a single grouper.",
-                FutureWarning,
-            )
         group_names, offsets, _, grouped_values = self._grouped()
         if isinstance(group_names, cudf.BaseIndex):
             group_names = group_names.to_pandas()
         for i, name in enumerate(group_names):
-            yield name, grouped_values[offsets[i] : offsets[i + 1]]
+            yield (name,) if isinstance(self._by, list) and len(
+                self._by
+            ) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]]
 
     @property
     def dtypes(self):
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 
 import pandas as pd
@@ -7,6 +7,7 @@
 import cudf
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.testing._utils import assert_eq
+from cudf.core._compat import PANDAS_GE_200
 
 simple_test_data = [
     {},
@@ -52,7 +53,15 @@ def test_to_pandas_simple(simple_data):
     Test that a ColumnAccessor converts to a correct pd.Index
     """
     ca = ColumnAccessor(simple_data)
-    assert_eq(ca.to_pandas_index(), pd.DataFrame(simple_data).columns)
+    # cudf cannot return a RangeIndex here, while pandas does.
+    # pandas compares `inferred_type`, which is `empty` for
+    # Index([], dtype='object') and `integer` for RangeIndex().
+    # To skip that comparison when PANDAS_GE_200, pass exact=False.
+    assert_eq(
+        ca.to_pandas_index(),
+        pd.DataFrame(simple_data).columns,
+        exact=not PANDAS_GE_200,
+    )
 
 
 def test_to_pandas_multiindex(mi_data):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -308,7 +308,7 @@ def test_axes(data):
     actual = csr.axes
 
     for e, a in zip(expected, actual):
-        assert_eq(e, a)
+        assert_eq(e, a, exact=not PANDAS_GE_200)
 
 
 def test_dataframe_truncate_axis_0():
@@ -4938,7 +4938,12 @@ def test_rowwise_ops(data, op, skipna, numeric_only):
         expected = getattr(pdf, op)(**kwargs)
         got = getattr(gdf, op)(**kwargs)
 
-        assert_eq(expected, got, check_dtype=False)
+        assert_eq(
+            expected,
+            got,
+            check_dtype=False,
+            check_index_type=False if len(got.index) == 0 else True,
+        )
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
@@ -33,7 +33,6 @@
     TIMEDELTA_TYPES,
     assert_eq,
     assert_exceptions_equal,
-    expect_warning_if,
 )
 from cudf.testing.dataset_generator import rand_dataframe
 
@@ -982,8 +981,7 @@ def test_groupby_unsupported_columns():
     )
     pdf["b"] = pd_cat
     gdf = cudf.from_pandas(pdf)
-    with pytest.warns(FutureWarning):
-        pdg = pdf.groupby("x").sum()
+    pdg = pdf.groupby("x").sum(numeric_only=True)
     # cudf does not yet support numeric_only, so our default is False (unlike
     # pandas, which defaults to inferring and throws a warning about it).
     gdg = gdf.groupby("x").sum()
@@ -1547,15 +1545,11 @@ def test_grouping(grouper):
     )
     gdf = cudf.from_pandas(pdf)
 
-    # There's no easy way to validate that the same warning is thrown by both
-    # cudf and pandas here because it's only thrown upon iteration, so we
-    # settle for catching warnings on the whole block.
-    with expect_warning_if(isinstance(grouper, list) and len(grouper) == 1):
-        for pdf_group, gdf_group in zip(
-            pdf.groupby(grouper), gdf.groupby(grouper)
-        ):
-            assert pdf_group[0] == gdf_group[0]
-            assert_eq(pdf_group[1], gdf_group[1])
+    for pdf_group, gdf_group in zip(
+        pdf.groupby(grouper), gdf.groupby(grouper)
+    ):
+        assert pdf_group[0] == gdf_group[0]
+        assert_eq(pdf_group[1], gdf_group[1])
 
 
 @pytest.mark.parametrize("agg", [lambda x: x.count(), "count"])
@@ -3311,8 +3305,8 @@ def test_head_tail_empty():
 
     expected = pdf.groupby(pd.Series(values)).head()
     got = df.groupby(cudf.Series(values)).head()
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_column_type=not PANDAS_GE_200)
 
     expected = pdf.groupby(pd.Series(values)).tail()
     got = df.groupby(cudf.Series(values)).tail()
-    assert_eq(expected, got)
+    assert_eq(expected, got, check_column_type=not PANDAS_GE_200)
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150
+from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing._utils import (
     INTEGER_TYPES,
@@ -1008,8 +1008,9 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
             pd.Series(["one", "two", "three"], dtype="category"),
             {"to_replace": "one", "value": "two", "inplace": True},
             marks=pytest.mark.xfail(
-                condition=not PANDAS_GE_134,
-                reason="https://github.com/pandas-dev/pandas/issues/43232",
+                condition=(not PANDAS_GE_134) or (PANDAS_GE_200),
+                reason="https://github.com/pandas-dev/pandas/issues/43232"
+                "https://github.com/pandas-dev/pandas/issues/53358",
             ),
         ),
         (