Implement scan operations for decimal columns (#7707)

ChrisJar · web-flow · commit f38daf384a7c · 2021-03-24T23:21:53.000Z
This adds support for `cummin`, `cummax`, and `cumsum` in cuDF for columns of type `decimal`. Authors: @ChrisJar. Approvers: GALI PREM SAGAR (@galipremsagar). URL: #7707
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -72,6 +72,9 @@ def binary_operator(self, op, other, reflect=False):
         result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
         return result
 
+    def _apply_scan_op(self, op: str) -> ColumnBase:
+        return libcudf.reduce.scan(op, self, True)
+
     def as_decimal_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.DecimalColumn":
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -4725,8 +4725,9 @@ def cumsum(self, axis=0, skipna=True, *args, **kwargs):
                 result_col[first_index:] = None
 
         # pandas always returns int64 dtype if original dtype is int or `bool`
-        if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype(
-            result_col.dtype, np.bool_
+        if not is_decimal_dtype(result_col.dtype) and (
+            np.issubdtype(result_col.dtype, np.integer)
+            or np.issubdtype(result_col.dtype, np.bool_)
         ):
             return Series(
                 result_col.astype(np.int64)._apply_scan_op("sum"),
@@ -4774,6 +4775,11 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs):
         if axis not in (None, 0):
             raise NotImplementedError("axis parameter is not implemented yet")
 
+        if is_decimal_dtype(self.dtype):
+            raise NotImplementedError(
+                "cumprod does not currently support decimal types"
+            )
+
         skipna = True if skipna is None else skipna
 
         if skipna:
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
@@ -6,6 +6,7 @@
 
 import cudf
 from cudf.tests.utils import INTEGER_TYPES, NUMERIC_TYPES, assert_eq, gen_rand
+from cudf.core.dtypes import Decimal64Dtype
 
 params_sizes = [0, 1, 2, 5]
 
@@ -61,6 +62,21 @@ def test_cumsum_masked():
         assert_eq(got, expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(10, 5), Decimal64Dtype(12, 7)],
+)
+def test_cumsum_decimal(dtype):
+    data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cumsum()
+    expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cummin(dtype, nelem):
     if dtype == np.int8:
@@ -103,6 +119,21 @@ def test_cummin_masked():
         assert_eq(gs.cummin(), expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)],
+)
+def test_cummin_decimal(dtype):
+    data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummin()
+    expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cummax(dtype, nelem):
     if dtype == np.int8:
@@ -145,6 +176,21 @@ def test_cummax_masked():
         assert_eq(gs.cummax(), expected)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [Decimal64Dtype(8, 4), Decimal64Dtype(11, 6), Decimal64Dtype(14, 7)],
+)
+def test_cummax_decimal(dtype):
+    data = [np.nan, "54.203", "8.222", "644.32", "-562.272"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummax()
+    expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
 @pytest.mark.parametrize("dtype,nelem", list(_gen_params()))
 def test_cumprod(dtype, nelem):
     if dtype == np.int8: