Skip to content

Commit 7d52970

Browse files
authored
Support numeric_only field for rank() (#7213)
Closes #7174. This PR adds support for the `numeric_only` parameter of `DataFrame.rank()` and `Series.rank()`. When the user specifies `numeric_only=True`, only the columns with numerical data types are selected to construct a cudf object, which is then passed to the lower level for processing. Two minor refactors are also included in this PR: - The internal API `Frame._get_columns_by_label` is refactored so that both `DataFrame` and `Series` can dispatch to it. - `test_rank.py` is refactored: the test functions inside class `TestRank` are moved out to top-level functions, and all test variables shared among test cases are moved to a `pytest.fixture` function. A `DataFrame.rank` test case that is expected to fail due to a [pandas bug](pandas-dev/pandas#32593) is now captured under `pytest.raises`. Authors: - Michael Wang (@isVoid) Approvers: - Ashwin Srinath (@shwina) - @brandon-b-miller URL: #7213
1 parent cbc0394 commit 7d52970

File tree

4 files changed

+183
-161
lines changed

4 files changed

+183
-161
lines changed

python/cudf/cudf/core/dataframe.py

+20
Original file line numberDiff line numberDiff line change
@@ -1318,6 +1318,26 @@ def _repr_html_(self):
13181318
def _repr_latex_(self):
13191319
return self._get_renderable_dataframe().to_pandas()._repr_latex_()
13201320

1321+
def _get_columns_by_label(self, labels, downcast=False):
    """
    Return columns of dataframe by `labels`

    If downcast is True, try and downcast from a DataFrame to a Series

    Parameters
    ----------
    labels
        Column label(s); forwarded unchanged to the parent class'
        ``_get_columns_by_label``.
    downcast : bool, default False
        When True, the selection is returned through
        ``self._constructor_sliced`` if the columns are not a multiindex,
        or if `labels` is a scalar/tuple addressing every column level.

    Returns
    -------
    The object built by ``self._constructor_sliced`` when the downcast
    applies, otherwise the object built by ``self._constructor``.
    """
    new_data = super()._get_columns_by_label(labels, downcast)
    if downcast:
        # Labels that are neither scalar nor tuple (e.g. a list) have no
        # level count. Keeping ``nlevels`` bound to None makes the
        # comparison below evaluate to False instead of raising
        # UnboundLocalError when the columns are a multiindex.
        nlevels = None
        if is_scalar(labels):
            nlevels = 1
        elif isinstance(labels, tuple):
            nlevels = len(labels)
        if self._data.multiindex is False or nlevels == self._data.nlevels:
            return self._constructor_sliced(
                new_data, name=labels, index=self.index
            )
    return self._constructor(
        new_data, columns=new_data.to_pandas_index(), index=self.index
    )
1340+
13211341
# unary, binary, rbinary, orderedcompare, unorderedcompare
13221342
def _apply_op(self, fn, other=None, fill_value=None):
13231343

python/cudf/cudf/core/frame.py

+11-18
Original file line numberDiff line numberDiff line change
@@ -483,25 +483,12 @@ def equals(self, other, **kwargs):
483483
else:
484484
return self._index.equals(other._index)
485485

486-
def _get_columns_by_label(self, labels, downcast=False):
486+
def _get_columns_by_label(self, labels, downcast):
487487
"""
488488
Returns columns of the Frame specified by `labels`
489489
490-
If downcast is True, try and downcast from a DataFrame to a Series
491-
"""
492-
new_data = self._data.select_by_label(labels)
493-
if downcast:
494-
if is_scalar(labels):
495-
nlevels = 1
496-
elif isinstance(labels, tuple):
497-
nlevels = len(labels)
498-
if self._data.multiindex is False or nlevels == self._data.nlevels:
499-
return self._constructor_sliced(
500-
new_data, name=labels, index=self.index
501-
)
502-
return self._constructor(
503-
new_data, columns=new_data.to_pandas_index(), index=self.index
504-
)
490+
"""
491+
return self._data.select_by_label(labels)
505492

506493
def _get_columns_by_index(self, indices):
507494
"""
@@ -1643,10 +1630,16 @@ def rank(
16431630
"na_option must be one of 'keep', 'top', or 'bottom'"
16441631
)
16451632

1646-
# TODO code for selecting numeric columns
16471633
source = self
16481634
if numeric_only:
1649-
warnings.warn("numeric_only=True is not implemented yet")
1635+
numeric_cols = (
1636+
name
1637+
for name in self._data.names
1638+
if is_numerical_dtype(self._data[name])
1639+
)
1640+
source = self._get_columns_by_label(numeric_cols)
1641+
if source.empty:
1642+
return source.astype("float64")
16501643

16511644
out_rank_table = libcudf.sort.rank_columns(
16521645
source, method_enum, na_option, ascending, pct

python/cudf/cudf/core/series.py

+14
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,20 @@ def _copy_construct(self, **kwargs):
405405
params.update(kwargs)
406406
return cls(**params)
407407

408+
def _get_columns_by_label(self, labels, downcast=False):
    """Select the column of this Series identified by `labels`.

    The selection itself is delegated to the parent class. A non-empty
    selection is wrapped through ``self._constructor`` on this object's
    index; an empty selection yields an empty object carrying this
    object's dtype and name.

    `downcast` is only forwarded to the parent implementation; it is not
    otherwise used here.
    """
    selected = super()._get_columns_by_label(labels, downcast)

    if len(selected) > 0:
        return self._constructor(data=selected, index=self.index)
    # Nothing matched: hand back an empty result with the same dtype/name.
    return self._constructor(dtype=self.dtype, name=self.name)
421+
408422
@classmethod
409423
def from_arrow(cls, array):
410424
"""

python/cudf/cudf/tests/test_rank.py

+138-143
Original file line numberDiff line numberDiff line change
@@ -10,152 +10,147 @@
1010
from cudf.tests.utils import assert_eq, assert_exceptions_equal
1111

1212

13-
class TestRank:
14-
index = np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10])
15-
col1 = np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6])
16-
col2 = np.array([5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf])
17-
18-
@pytest.mark.parametrize("dtype", ["O", "f8", "i4"])
19-
@pytest.mark.parametrize("ascending", [True, False])
20-
@pytest.mark.parametrize(
21-
"method", ["average", "min", "max", "first", "dense"]
22-
)
23-
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
24-
@pytest.mark.parametrize("pct", [True, False])
25-
def test_rank_all_arguments(
26-
self, dtype, ascending, method, na_option, pct
27-
):
28-
if method == "first" and dtype == "O":
29-
# not supported by pandas
30-
return
31-
pdf = pd.DataFrame(index=self.index)
32-
pdf["col1"] = self.col1.astype(dtype)
33-
pdf["col2"] = self.col2.astype(dtype)
34-
gdf = DataFrame.from_pandas(pdf)
35-
36-
def _check(gs, ps, method, na_option, ascending, pct):
37-
ranked_gs = gs.rank(
38-
method=method,
39-
na_option=na_option,
40-
ascending=ascending,
41-
pct=pct,
42-
)
43-
ranked_ps = ps.rank(
44-
method=method,
45-
na_option=na_option,
46-
ascending=ascending,
47-
pct=pct,
48-
)
49-
assert_eq(ranked_ps, ranked_gs.to_pandas())
50-
51-
# # Series
52-
_check(
53-
gdf["col1"],
54-
pdf["col1"],
55-
method=method,
56-
na_option=na_option,
57-
ascending=ascending,
58-
pct=pct,
59-
)
60-
_check(
61-
gdf["col2"],
62-
pdf["col2"],
63-
method=method,
64-
na_option=na_option,
65-
ascending=ascending,
66-
pct=pct,
67-
)
68-
# TODO: https://github.com/pandas-dev/pandas/issues/32593
69-
# Dataframe (bug in pandas)
70-
# _check(
71-
# gdf,
72-
# pdf,
73-
# method=method,
74-
# na_option=na_option,
75-
# ascending=ascending,
76-
# pct=pct,
77-
# )
78-
79-
def test_rank_error_arguments(self):
80-
pdf = pd.DataFrame(index=self.index)
81-
pdf["col1"] = self.col1
82-
pdf["col2"] = self.col2
83-
gdf = DataFrame.from_pandas(pdf)
84-
85-
assert_exceptions_equal(
86-
lfunc=pdf["col1"].rank,
87-
rfunc=gdf["col1"].rank,
88-
lfunc_args_and_kwargs=(
89-
[],
90-
{
91-
"method": "randomname",
92-
"na_option": "keep",
93-
"ascending": True,
94-
"pct": True,
95-
},
13+
@pytest.fixture
14+
def pdf():
15+
return pd.DataFrame(
16+
{
17+
"col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]),
18+
"col2": np.array(
19+
[5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
9620
),
97-
rfunc_args_and_kwargs=(
98-
[],
99-
{
100-
"method": "randomname",
101-
"na_option": "keep",
102-
"ascending": True,
103-
"pct": True,
104-
},
105-
),
106-
)
21+
},
22+
index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]),
23+
)
10724

108-
assert_exceptions_equal(
109-
lfunc=pdf["col1"].rank,
110-
rfunc=gdf["col1"].rank,
111-
lfunc_args_and_kwargs=(
112-
[],
113-
{
114-
"method": "first",
115-
"na_option": "randomname",
116-
"ascending": True,
117-
"pct": True,
118-
},
119-
),
120-
rfunc_args_and_kwargs=(
121-
[],
122-
{
123-
"method": "first",
124-
"na_option": "randomname",
125-
"ascending": True,
126-
"pct": True,
127-
},
128-
),
25+
26+
@pytest.mark.parametrize("dtype", ["O", "f8", "i4"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_rank_all_arguments(
    pdf, dtype, ascending, method, na_option, pct, numeric_only
):
    """Compare ``rank`` on the cudf objects against pandas for every
    combination of the parametrized arguments.

    Series results are compared for ``col1`` and ``col2``. When
    `numeric_only` is set, a string column is added and only the
    emptiness of its rank result is compared. The DataFrame comparison
    is expected to pass for a single argument combination and to fail
    with an ``AssertionError`` otherwise (see the linked pandas issue).

    NOTE(review): `dtype` is not applied to the fixture data in this
    function; it only takes part in the early return and in choosing the
    DataFrame expectation below -- confirm whether a cast was intended.
    """
    if method == "first" and dtype == "O":
        # not supported by pandas
        return

    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        pdf["str"] = np.array(
            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"]
        )
    gdf = DataFrame.from_pandas(pdf)

    rank_args = dict(
        method=method,
        na_option=na_option,
        ascending=ascending,
        pct=pct,
        numeric_only=numeric_only,
    )

    # Series
    for column in ("col1", "col2"):
        assert_eq(gdf[column].rank(**rank_args), pdf[column].rank(**rank_args))
    if numeric_only:
        expect = pdf["str"].rank(**rank_args)
        got = gdf["str"].rank(**rank_args)
        assert expect.empty == got.empty

    # TODO: https://github.com/pandas-dev/pandas/issues/32593
    # Dataframe (bug in pandas)
    frames_expected_equal = (
        na_option == "top"
        and method == "first"
        and dtype != "O"
        and ascending
    )
    if frames_expected_equal:
        assert_eq(gdf.rank(**rank_args), pdf.rank(**rank_args))
    else:
        with pytest.raises(AssertionError, match="values are different"):
            assert_eq(gdf.rank(**rank_args), pdf.rank(**rank_args))
74+
75+
76+
def test_rank_error_arguments(pdf):
77+
gdf = DataFrame.from_pandas(pdf)
78+
79+
assert_exceptions_equal(
80+
lfunc=pdf["col1"].rank,
81+
rfunc=gdf["col1"].rank,
82+
lfunc_args_and_kwargs=(
83+
[],
84+
{
85+
"method": "randomname",
86+
"na_option": "keep",
87+
"ascending": True,
88+
"pct": True,
89+
},
90+
),
91+
rfunc_args_and_kwargs=(
92+
[],
93+
{
94+
"method": "randomname",
95+
"na_option": "keep",
96+
"ascending": True,
97+
"pct": True,
98+
},
99+
),
100+
)
130101

131-
sort_group_args = [
132-
np.full((3,), np.nan),
133-
100 * np.random.random(10),
134-
np.full((3,), np.inf),
135-
np.full((3,), -np.inf),
136-
]
137-
sort_dtype_args = [np.int32, np.float32, np.float64]
138-
# TODO: np.int64, disabled because of bug
139-
# https://github.com/pandas-dev/pandas/issues/32859
140-
141-
@pytest.mark.parametrize(
142-
"elem,dtype",
143-
list(
144-
product(
145-
combinations_with_replacement(sort_group_args, 4),
146-
sort_dtype_args,
147-
)
102+
assert_exceptions_equal(
103+
lfunc=pdf["col1"].rank,
104+
rfunc=gdf["col1"].rank,
105+
lfunc_args_and_kwargs=(
106+
[],
107+
{
108+
"method": "first",
109+
"na_option": "randomname",
110+
"ascending": True,
111+
"pct": True,
112+
},
148113
),
114+
rfunc_args_and_kwargs=(
115+
[],
116+
{
117+
"method": "first",
118+
"na_option": "randomname",
119+
"ascending": True,
120+
"pct": True,
121+
},
122+
),
123+
)
124+
125+
126+
sort_group_args = [
127+
np.full((3,), np.nan),
128+
100 * np.random.random(10),
129+
np.full((3,), np.inf),
130+
np.full((3,), -np.inf),
131+
]
132+
sort_dtype_args = [np.int32, np.float32, np.float64]
133+
# TODO: np.int64, disabled because of bug
134+
# https://github.com/pandas-dev/pandas/issues/32859
135+
136+
137+
@pytest.mark.parametrize(
138+
"elem,dtype",
139+
list(
140+
product(
141+
combinations_with_replacement(sort_group_args, 4), sort_dtype_args,
142+
)
143+
),
144+
)
145+
def test_series_rank_combinations(elem, dtype):
146+
np.random.seed(0)
147+
gdf = DataFrame()
148+
gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(
149+
dtype
149150
)
150-
def test_series_rank_combinations(self, elem, dtype):
151-
np.random.seed(0)
152-
gdf = DataFrame()
153-
gdf["a"] = aa = np.fromiter(
154-
chain.from_iterable(elem), np.float64
155-
).astype(dtype)
156-
ranked_gs = gdf["a"].rank(method="first")
157-
df = pd.DataFrame()
158-
df["a"] = aa
159-
ranked_ps = df["a"].rank(method="first")
160-
# Check
161-
assert_eq(ranked_ps, ranked_gs.to_pandas())
151+
ranked_gs = gdf["a"].rank(method="first")
152+
df = pd.DataFrame()
153+
df["a"] = aa
154+
ranked_ps = df["a"].rank(method="first")
155+
# Check
156+
assert_eq(ranked_ps, ranked_gs.to_pandas())

0 commit comments

Comments
 (0)