Support numeric_only field for rank() (#7213)

isVoid · web-flow · commit 7d52970a37e7 · 2021-01-28T02:19:49.000Z
Closes #7174 This PR adds support for the `numeric_only` parameter of `DataFrame.rank()` and `Series.rank()`. When the user specifies `numeric_only=True`, only the columns with numeric data types are selected to construct a cudf object, which is then passed to the lower level for processing. Two minor refactors are also included in this PR: - This PR refactors the internal API `Frame._get_columns_by_label`, which now supports dispatching to this method from both `DataFrame` and `Series`. - This PR refactors `test_rank.py`, moving the test functions inside class `TestRank` out to top-level functions. All test variables shared among test cases are moved to a `pytest.fixture` function. A `DataFrame.rank` test case that is expected to raise due to a [pandas bug](pandas-dev/pandas#32593) is now captured under `pytest.raises`. Authors: - Michael Wang (@isVoid) Approvers: - Ashwin Srinath (@shwina) - @brandon-b-miller URL: #7213
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -1318,6 +1318,26 @@ def _repr_html_(self):
     def _repr_latex_(self):
         return self._get_renderable_dataframe().to_pandas()._repr_latex_()
 
+    def _get_columns_by_label(self, labels, downcast=False):
+        """
+        Return columns of dataframe by `labels`
+
+        If downcast is True, try and downcast from a DataFrame to a Series
+        """
+        new_data = super()._get_columns_by_label(labels, downcast)
+        if downcast:
+            if is_scalar(labels):
+                nlevels = 1
+            elif isinstance(labels, tuple):
+                nlevels = len(labels)
+            if self._data.multiindex is False or nlevels == self._data.nlevels:
+                return self._constructor_sliced(
+                    new_data, name=labels, index=self.index
+                )
+        return self._constructor(
+            new_data, columns=new_data.to_pandas_index(), index=self.index
+        )
+
     # unary, binary, rbinary, orderedcompare, unorderedcompare
     def _apply_op(self, fn, other=None, fill_value=None):
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -483,25 +483,12 @@ def equals(self, other, **kwargs):
         else:
             return self._index.equals(other._index)
 
-    def _get_columns_by_label(self, labels, downcast=False):
+    def _get_columns_by_label(self, labels, downcast):
         """
         Returns columns of the Frame specified by `labels`
 
-        If downcast is True, try and downcast from a DataFrame to a Series
-        """
-        new_data = self._data.select_by_label(labels)
-        if downcast:
-            if is_scalar(labels):
-                nlevels = 1
-            elif isinstance(labels, tuple):
-                nlevels = len(labels)
-            if self._data.multiindex is False or nlevels == self._data.nlevels:
-                return self._constructor_sliced(
-                    new_data, name=labels, index=self.index
-                )
-        return self._constructor(
-            new_data, columns=new_data.to_pandas_index(), index=self.index
-        )
+        """
+        return self._data.select_by_label(labels)
 
     def _get_columns_by_index(self, indices):
         """
@@ -1643,10 +1630,16 @@ def rank(
                 "na_option must be one of 'keep', 'top', or 'bottom'"
             )
 
-        # TODO code for selecting numeric columns
         source = self
         if numeric_only:
-            warnings.warn("numeric_only=True is not implemented yet")
+            numeric_cols = (
+                name
+                for name in self._data.names
+                if is_numerical_dtype(self._data[name])
+            )
+            source = self._get_columns_by_label(numeric_cols)
+            if source.empty:
+                return source.astype("float64")
 
         out_rank_table = libcudf.sort.rank_columns(
             source, method_enum, na_option, ascending, pct
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -405,6 +405,20 @@ def _copy_construct(self, **kwargs):
         params.update(kwargs)
         return cls(**params)
 
+    def _get_columns_by_label(self, labels, downcast=False):
+        """Return the column specified by `labels`
+
+        For cudf.Series, either the column, or an empty series is returned.
+        Parameter `downcast` does not have effects.
+        """
+        new_data = super()._get_columns_by_label(labels, downcast)
+
+        return (
+            self._constructor(data=new_data, index=self.index)
+            if len(new_data) > 0
+            else self._constructor(dtype=self.dtype, name=self.name)
+        )
+
     @classmethod
     def from_arrow(cls, array):
         """
diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
@@ -10,152 +10,147 @@
 from cudf.tests.utils import assert_eq, assert_exceptions_equal
 
 
-class TestRank:
-    index = np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10])
-    col1 = np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6])
-    col2 = np.array([5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf])
-
-    @pytest.mark.parametrize("dtype", ["O", "f8", "i4"])
-    @pytest.mark.parametrize("ascending", [True, False])
-    @pytest.mark.parametrize(
-        "method", ["average", "min", "max", "first", "dense"]
-    )
-    @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
-    @pytest.mark.parametrize("pct", [True, False])
-    def test_rank_all_arguments(
-        self, dtype, ascending, method, na_option, pct
-    ):
-        if method == "first" and dtype == "O":
-            # not supported by pandas
-            return
-        pdf = pd.DataFrame(index=self.index)
-        pdf["col1"] = self.col1.astype(dtype)
-        pdf["col2"] = self.col2.astype(dtype)
-        gdf = DataFrame.from_pandas(pdf)
-
-        def _check(gs, ps, method, na_option, ascending, pct):
-            ranked_gs = gs.rank(
-                method=method,
-                na_option=na_option,
-                ascending=ascending,
-                pct=pct,
-            )
-            ranked_ps = ps.rank(
-                method=method,
-                na_option=na_option,
-                ascending=ascending,
-                pct=pct,
-            )
-            assert_eq(ranked_ps, ranked_gs.to_pandas())
-
-        # # Series
-        _check(
-            gdf["col1"],
-            pdf["col1"],
-            method=method,
-            na_option=na_option,
-            ascending=ascending,
-            pct=pct,
-        )
-        _check(
-            gdf["col2"],
-            pdf["col2"],
-            method=method,
-            na_option=na_option,
-            ascending=ascending,
-            pct=pct,
-        )
-        # TODO: https://github.com/pandas-dev/pandas/issues/32593
-        # Dataframe (bug in pandas)
-        # _check(
-        #     gdf,
-        #     pdf,
-        #     method=method,
-        #     na_option=na_option,
-        #     ascending=ascending,
-        #     pct=pct,
-        # )
-
-    def test_rank_error_arguments(self):
-        pdf = pd.DataFrame(index=self.index)
-        pdf["col1"] = self.col1
-        pdf["col2"] = self.col2
-        gdf = DataFrame.from_pandas(pdf)
-
-        assert_exceptions_equal(
-            lfunc=pdf["col1"].rank,
-            rfunc=gdf["col1"].rank,
-            lfunc_args_and_kwargs=(
-                [],
-                {
-                    "method": "randomname",
-                    "na_option": "keep",
-                    "ascending": True,
-                    "pct": True,
-                },
+@pytest.fixture
+def pdf():
+    return pd.DataFrame(
+        {
+            "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]),
+            "col2": np.array(
+                [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf]
             ),
-            rfunc_args_and_kwargs=(
-                [],
-                {
-                    "method": "randomname",
-                    "na_option": "keep",
-                    "ascending": True,
-                    "pct": True,
-                },
-            ),
-        )
+        },
+        index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]),
+    )
 
-        assert_exceptions_equal(
-            lfunc=pdf["col1"].rank,
-            rfunc=gdf["col1"].rank,
-            lfunc_args_and_kwargs=(
-                [],
-                {
-                    "method": "first",
-                    "na_option": "randomname",
-                    "ascending": True,
-                    "pct": True,
-                },
-            ),
-            rfunc_args_and_kwargs=(
-                [],
-                {
-                    "method": "first",
-                    "na_option": "randomname",
-                    "ascending": True,
-                    "pct": True,
-                },
-            ),
+
+@pytest.mark.parametrize("dtype", ["O", "f8", "i4"])
+@pytest.mark.parametrize("ascending", [True, False])
+@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
+@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_rank_all_arguments(
+    pdf, dtype, ascending, method, na_option, pct, numeric_only
+):
+    if method == "first" and dtype == "O":
+        # not supported by pandas
+        return
+
+    pdf = pdf.copy(deep=True)  # for parallel pytest
+    if numeric_only:
+        pdf["str"] = np.array(
+            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"]
         )
+    gdf = DataFrame.from_pandas(pdf)
+
+    kwargs = {
+        "method": method,
+        "na_option": na_option,
+        "ascending": ascending,
+        "pct": pct,
+        "numeric_only": numeric_only,
+    }
+
+    # Series
+    assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs))
+    assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs))
+    if numeric_only:
+        expect = pdf["str"].rank(**kwargs)
+        got = gdf["str"].rank(**kwargs)
+        assert expect.empty == got.empty
+
+    # TODO: https://github.com/pandas-dev/pandas/issues/32593
+    # Dataframe (bug in pandas)
+    if (
+        na_option == "top"
+        and method == "first"
+        and not dtype == "O"
+        and ascending
+    ):
+        assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
+    else:
+        with pytest.raises(AssertionError, match="values are different"):
+            assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
+
+
+def test_rank_error_arguments(pdf):
+    gdf = DataFrame.from_pandas(pdf)
+
+    assert_exceptions_equal(
+        lfunc=pdf["col1"].rank,
+        rfunc=gdf["col1"].rank,
+        lfunc_args_and_kwargs=(
+            [],
+            {
+                "method": "randomname",
+                "na_option": "keep",
+                "ascending": True,
+                "pct": True,
+            },
+        ),
+        rfunc_args_and_kwargs=(
+            [],
+            {
+                "method": "randomname",
+                "na_option": "keep",
+                "ascending": True,
+                "pct": True,
+            },
+        ),
+    )
 
-    sort_group_args = [
-        np.full((3,), np.nan),
-        100 * np.random.random(10),
-        np.full((3,), np.inf),
-        np.full((3,), -np.inf),
-    ]
-    sort_dtype_args = [np.int32, np.float32, np.float64]
-    # TODO: np.int64, disabled because of bug
-    # https://github.com/pandas-dev/pandas/issues/32859
-
-    @pytest.mark.parametrize(
-        "elem,dtype",
-        list(
-            product(
-                combinations_with_replacement(sort_group_args, 4),
-                sort_dtype_args,
-            )
+    assert_exceptions_equal(
+        lfunc=pdf["col1"].rank,
+        rfunc=gdf["col1"].rank,
+        lfunc_args_and_kwargs=(
+            [],
+            {
+                "method": "first",
+                "na_option": "randomname",
+                "ascending": True,
+                "pct": True,
+            },
         ),
+        rfunc_args_and_kwargs=(
+            [],
+            {
+                "method": "first",
+                "na_option": "randomname",
+                "ascending": True,
+                "pct": True,
+            },
+        ),
+    )
+
+
+sort_group_args = [
+    np.full((3,), np.nan),
+    100 * np.random.random(10),
+    np.full((3,), np.inf),
+    np.full((3,), -np.inf),
+]
+sort_dtype_args = [np.int32, np.float32, np.float64]
+# TODO: np.int64, disabled because of bug
+# https://github.com/pandas-dev/pandas/issues/32859
+
+
+@pytest.mark.parametrize(
+    "elem,dtype",
+    list(
+        product(
+            combinations_with_replacement(sort_group_args, 4), sort_dtype_args,
+        )
+    ),
+)
+def test_series_rank_combinations(elem, dtype):
+    np.random.seed(0)
+    gdf = DataFrame()
+    gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(
+        dtype
     )
-    def test_series_rank_combinations(self, elem, dtype):
-        np.random.seed(0)
-        gdf = DataFrame()
-        gdf["a"] = aa = np.fromiter(
-            chain.from_iterable(elem), np.float64
-        ).astype(dtype)
-        ranked_gs = gdf["a"].rank(method="first")
-        df = pd.DataFrame()
-        df["a"] = aa
-        ranked_ps = df["a"].rank(method="first")
-        # Check
-        assert_eq(ranked_ps, ranked_gs.to_pandas())
+    ranked_gs = gdf["a"].rank(method="first")
+    df = pd.DataFrame()
+    df["a"] = aa
+    ranked_ps = df["a"].rank(method="first")
+    # Check
+    assert_eq(ranked_ps, ranked_gs.to_pandas())