Skip to content

Commit 2dafcfc

Browse files
Enforce GroupBy.__iter__ deprecation and miscellaneous pytest fixes (#13423)
This PR: - [x] Enforces deprecation in `GroupBy.__iter__` - [x] Fixes miscellaneous pytest failures caused by pre-existing differences between cudf and pandas, and by the newly introduced `inferred_type` in Index.
1 parent c1e78b9 commit 2dafcfc

File tree

7 files changed

+49
-34
lines changed

7 files changed

+49
-34
lines changed

python/cudf/cudf/core/column/column.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -2314,7 +2314,16 @@ def as_column(
23142314
pa_type = np_to_pa_dtype(
23152315
_maybe_convert_to_default_type("float")
23162316
)
2317-
2317+
if (
2318+
pa_type is None
2319+
and isinstance(arbitrary, pd.Index)
2320+
and arbitrary.shape == (0,)
2321+
):
2322+
# When an empty `pd.Index` is passed to `pa.array`,
2323+
# a type of `null-type` is returned by pyarrow, hence
2324+
# we need this workaround to preserve the dtype of
2325+
# column being created.
2326+
pa_type = np_to_pa_dtype(arbitrary.dtype)
23182327
data = as_column(
23192328
pa.array(
23202329
arbitrary,

python/cudf/cudf/core/dataframe.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -5852,6 +5852,7 @@ def _reduce(
58525852
):
58535853

58545854
source = self
5855+
axis = source._get_axis_from_axis_arg(axis)
58555856
if numeric_only:
58565857
numeric_cols = (
58575858
name
@@ -5860,9 +5861,11 @@ def _reduce(
58605861
)
58615862
source = self._get_columns_by_label(numeric_cols)
58625863
if source.empty:
5863-
return Series(index=self.index)
5864-
5865-
axis = source._get_axis_from_axis_arg(axis)
5864+
return Series(
5865+
index=self._data.to_pandas_index()[:0]
5866+
if axis == 0
5867+
else source.index
5868+
)
58665869

58675870
if axis == 0:
58685871
try:

python/cudf/cudf/core/groupby/groupby.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -272,19 +272,13 @@ def __init__(
272272
self.grouping = _Grouping(obj, by, level)
273273

274274
def __iter__(self):
275-
if isinstance(self._by, list) and len(self._by) == 1:
276-
warnings.warn(
277-
"In a future version of cudf, a length 1 tuple will be "
278-
"returned when iterating over a groupby with a grouper equal "
279-
"to a list of length 1. To avoid this warning, do not supply "
280-
"a list with a single grouper.",
281-
FutureWarning,
282-
)
283275
group_names, offsets, _, grouped_values = self._grouped()
284276
if isinstance(group_names, cudf.BaseIndex):
285277
group_names = group_names.to_pandas()
286278
for i, name in enumerate(group_names):
287-
yield name, grouped_values[offsets[i] : offsets[i + 1]]
279+
yield (name,) if isinstance(self._by, list) and len(
280+
self._by
281+
) == 1 else name, grouped_values[offsets[i] : offsets[i + 1]]
288282

289283
@property
290284
def dtypes(self):

python/cudf/cudf/tests/test_column_accessor.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2020, NVIDIA CORPORATION.
1+
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
22

33

44
import pandas as pd
@@ -7,6 +7,7 @@
77
import cudf
88
from cudf.core.column_accessor import ColumnAccessor
99
from cudf.testing._utils import assert_eq
10+
from cudf.core._compat import PANDAS_GE_200
1011

1112
simple_test_data = [
1213
{},
@@ -52,7 +53,15 @@ def test_to_pandas_simple(simple_data):
5253
Test that a ColumnAccessor converts to a correct pd.Index
5354
"""
5455
ca = ColumnAccessor(simple_data)
55-
assert_eq(ca.to_pandas_index(), pd.DataFrame(simple_data).columns)
56+
# We cannot return RangeIndex, while pandas returns RangeIndex.
57+
# Pandas compares `inferred_type` which is `empty` for
58+
# Index([], dtype='object'), and `integer` for RangeIndex()
59+
# to ignore this `inferred_type` comparison, we pass exact=False.
60+
assert_eq(
61+
ca.to_pandas_index(),
62+
pd.DataFrame(simple_data).columns,
63+
exact=not PANDAS_GE_200,
64+
)
5665

5766

5867
def test_to_pandas_multiindex(mi_data):

python/cudf/cudf/tests/test_dataframe.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ def test_axes(data):
308308
actual = csr.axes
309309

310310
for e, a in zip(expected, actual):
311-
assert_eq(e, a)
311+
assert_eq(e, a, exact=not PANDAS_GE_200)
312312

313313

314314
def test_dataframe_truncate_axis_0():
@@ -4938,7 +4938,12 @@ def test_rowwise_ops(data, op, skipna, numeric_only):
49384938
expected = getattr(pdf, op)(**kwargs)
49394939
got = getattr(gdf, op)(**kwargs)
49404940

4941-
assert_eq(expected, got, check_dtype=False)
4941+
assert_eq(
4942+
expected,
4943+
got,
4944+
check_dtype=False,
4945+
check_index_type=False if len(got.index) == 0 else True,
4946+
)
49424947

49434948

49444949
@pytest.mark.parametrize(

python/cudf/cudf/tests/test_groupby.py

+8-14
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
TIMEDELTA_TYPES,
3434
assert_eq,
3535
assert_exceptions_equal,
36-
expect_warning_if,
3736
)
3837
from cudf.testing.dataset_generator import rand_dataframe
3938

@@ -982,8 +981,7 @@ def test_groupby_unsupported_columns():
982981
)
983982
pdf["b"] = pd_cat
984983
gdf = cudf.from_pandas(pdf)
985-
with pytest.warns(FutureWarning):
986-
pdg = pdf.groupby("x").sum()
984+
pdg = pdf.groupby("x").sum(numeric_only=True)
987985
# cudf does not yet support numeric_only, so our default is False (unlike
988986
# pandas, which defaults to inferring and throws a warning about it).
989987
gdg = gdf.groupby("x").sum()
@@ -1547,15 +1545,11 @@ def test_grouping(grouper):
15471545
)
15481546
gdf = cudf.from_pandas(pdf)
15491547

1550-
# There's no easy way to validate that the same warning is thrown by both
1551-
# cudf and pandas here because it's only thrown upon iteration, so we
1552-
# settle for catching warnings on the whole block.
1553-
with expect_warning_if(isinstance(grouper, list) and len(grouper) == 1):
1554-
for pdf_group, gdf_group in zip(
1555-
pdf.groupby(grouper), gdf.groupby(grouper)
1556-
):
1557-
assert pdf_group[0] == gdf_group[0]
1558-
assert_eq(pdf_group[1], gdf_group[1])
1548+
for pdf_group, gdf_group in zip(
1549+
pdf.groupby(grouper), gdf.groupby(grouper)
1550+
):
1551+
assert pdf_group[0] == gdf_group[0]
1552+
assert_eq(pdf_group[1], gdf_group[1])
15591553

15601554

15611555
@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"])
@@ -3311,8 +3305,8 @@ def test_head_tail_empty():
33113305

33123306
expected = pdf.groupby(pd.Series(values)).head()
33133307
got = df.groupby(cudf.Series(values)).head()
3314-
assert_eq(expected, got)
3308+
assert_eq(expected, got, check_column_type=not PANDAS_GE_200)
33153309

33163310
expected = pdf.groupby(pd.Series(values)).tail()
33173311
got = df.groupby(cudf.Series(values)).tail()
3318-
assert_eq(expected, got)
3312+
assert_eq(expected, got, check_column_type=not PANDAS_GE_200)

python/cudf/cudf/tests/test_replace.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pytest
99

1010
import cudf
11-
from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150
11+
from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_GE_200
1212
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
1313
from cudf.testing._utils import (
1414
INTEGER_TYPES,
@@ -1008,8 +1008,9 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
10081008
pd.Series(["one", "two", "three"], dtype="category"),
10091009
{"to_replace": "one", "value": "two", "inplace": True},
10101010
marks=pytest.mark.xfail(
1011-
condition=not PANDAS_GE_134,
1012-
reason="https://github.com/pandas-dev/pandas/issues/43232",
1011+
condition=(not PANDAS_GE_134) or (PANDAS_GE_200),
1012+
reason="https://github.com/pandas-dev/pandas/issues/43232"
1013+
"https://github.com/pandas-dev/pandas/issues/53358",
10131014
),
10141015
),
10151016
(

0 commit comments

Comments
 (0)