Skip to content

Commit cb570fe

Browse files
authored
Deprecate dtype= parameter in reduction methods (#16313)
To align with pandas, whose reduction ops do not accept a `dtype` argument, this PR deprecates the `dtype=` parameter in cudf reduction methods. The same result can easily be achieved by calling `astype` on the result after the operation, and libcudf does not appear to support casting to an arbitrary output type. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #16313
1 parent dc62177 commit cb570fe

File tree

7 files changed

+45
-40
lines changed

7 files changed

+45
-40
lines changed

python/cudf/cudf/_lib/reduce.pyx

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
import warnings
23

34
import cudf
45
from cudf.core.buffer import acquire_spill_lock
@@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
2627
A numpy data type to use for the output, defaults
2728
to the same type as the input column
2829
"""
29-
30-
col_dtype = (
31-
dtype if dtype is not None
32-
else incol._reduction_result_dtype(reduction_op)
33-
)
30+
if dtype is not None:
31+
warnings.warn(
32+
"dtype is deprecated and will be removed in a future release. "
33+
"Cast the result (e.g. .astype) after the operation instead.",
34+
FutureWarning
35+
)
36+
col_dtype = dtype
37+
else:
38+
col_dtype = incol._reduction_result_dtype(reduction_op)
3439

3540
# check empty case
3641
if len(incol) <= incol.null_count:

python/cudf/cudf/core/column/column.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool:
261261
if self.null_count == self.size:
262262
return True
263263

264-
return libcudf.reduce.reduce("all", self, dtype=np.bool_)
264+
return libcudf.reduce.reduce("all", self)
265265

266266
def any(self, skipna: bool = True) -> bool:
267267
# Early exit for fast cases.
@@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool:
271271
elif skipna and self.null_count == self.size:
272272
return False
273273

274-
return libcudf.reduce.reduce("any", self, dtype=np.bool_)
274+
return libcudf.reduce.reduce("any", self)
275275

276276
def dropna(self) -> Self:
277277
if self.has_nulls():
@@ -1305,7 +1305,10 @@ def _reduce(
13051305
skipna=skipna, min_count=min_count
13061306
)
13071307
if isinstance(preprocessed, ColumnBase):
1308-
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
1308+
dtype = kwargs.pop("dtype", None)
1309+
return libcudf.reduce.reduce(
1310+
op, preprocessed, dtype=dtype, **kwargs
1311+
)
13091312
return preprocessed
13101313

13111314
def _process_for_reduction(
@@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
13361339
Determine the correct dtype to pass to libcudf based on
13371340
the input dtype, data dtype, and specific reduction op
13381341
"""
1342+
if reduction_op in {"any", "all"}:
1343+
return np.dtype(np.bool_)
13391344
return self.dtype
13401345

13411346
def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:

python/cudf/cudf/core/column/datetime.py

+3-6
Original file line numberDiff line numberDiff line change
@@ -485,26 +485,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
485485
format = format.split(" ")[0]
486486
return self.strftime(format)
487487

488-
def mean(
489-
self, skipna=None, min_count: int = 0, dtype=np.float64
490-
) -> ScalarLike:
488+
def mean(self, skipna=None, min_count: int = 0) -> ScalarLike:
491489
return pd.Timestamp(
492490
cast(
493491
"cudf.core.column.NumericalColumn", self.astype("int64")
494-
).mean(skipna=skipna, min_count=min_count, dtype=dtype),
492+
).mean(skipna=skipna, min_count=min_count),
495493
unit=self.time_unit,
496494
).as_unit(self.time_unit)
497495

498496
def std(
499497
self,
500498
skipna: bool | None = None,
501499
min_count: int = 0,
502-
dtype: Dtype = np.float64,
503500
ddof: int = 1,
504501
) -> pd.Timedelta:
505502
return pd.Timedelta(
506503
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
507-
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
504+
skipna=skipna, min_count=min_count, ddof=ddof
508505
)
509506
* _unit_to_nanoseconds_conversion[self.time_unit],
510507
).as_unit(self.time_unit)

python/cudf/cudf/core/column/numerical.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool:
395395
if result_col.null_count == result_col.size:
396396
return True
397397

398-
return libcudf.reduce.reduce("all", result_col, dtype=np.bool_)
398+
return libcudf.reduce.reduce("all", result_col)
399399

400400
def any(self, skipna: bool = True) -> bool:
401401
# Early exit for fast cases.
@@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool:
406406
elif skipna and result_col.null_count == result_col.size:
407407
return False
408408

409-
return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
409+
return libcudf.reduce.reduce("any", result_col)
410410

411411
@functools.cached_property
412412
def nan_count(self) -> int:
@@ -684,15 +684,16 @@ def to_pandas(
684684
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
685685

686686
def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
687-
col_dtype = self.dtype
688687
if reduction_op in {"sum", "product"}:
689-
col_dtype = (
690-
col_dtype if col_dtype.kind == "f" else np.dtype("int64")
691-
)
688+
if self.dtype.kind == "f":
689+
return self.dtype
690+
return np.dtype("int64")
692691
elif reduction_op == "sum_of_squares":
693-
col_dtype = np.result_dtype(col_dtype, np.dtype("uint64"))
692+
return np.result_dtype(self.dtype, np.dtype("uint64"))
693+
elif reduction_op in {"var", "std", "mean"}:
694+
return np.dtype("float64")
694695

695-
return col_dtype
696+
return super()._reduction_result_dtype(reduction_op)
696697

697698

698699
def _normalize_find_and_replace_input(

python/cudf/cudf/core/column/numerical_base.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -144,32 +144,27 @@ def mean(
144144
self,
145145
skipna: bool | None = None,
146146
min_count: int = 0,
147-
dtype=np.float64,
148147
):
149-
return self._reduce(
150-
"mean", skipna=skipna, min_count=min_count, dtype=dtype
151-
)
148+
return self._reduce("mean", skipna=skipna, min_count=min_count)
152149

153150
def var(
154151
self,
155152
skipna: bool | None = None,
156153
min_count: int = 0,
157-
dtype=np.float64,
158154
ddof=1,
159155
):
160156
return self._reduce(
161-
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
157+
"var", skipna=skipna, min_count=min_count, ddof=ddof
162158
)
163159

164160
def std(
165161
self,
166162
skipna: bool | None = None,
167163
min_count: int = 0,
168-
dtype=np.float64,
169164
ddof=1,
170165
):
171166
return self._reduce(
172-
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
167+
"std", skipna=skipna, min_count=min_count, ddof=ddof
173168
)
174169

175170
def median(self, skipna: bool | None = None) -> NumericalBaseColumn:

python/cudf/cudf/core/column/timedelta.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
287287
return self
288288
return libcudf.unary.cast(self, dtype=dtype)
289289

290-
def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
290+
def mean(self, skipna=None) -> pd.Timedelta:
291291
return pd.Timedelta(
292292
cast(
293293
"cudf.core.column.NumericalColumn", self.astype("int64")
294-
).mean(skipna=skipna, dtype=dtype),
294+
).mean(skipna=skipna),
295295
unit=self.time_unit,
296296
).as_unit(self.time_unit)
297297

@@ -345,12 +345,11 @@ def std(
345345
self,
346346
skipna: bool | None = None,
347347
min_count: int = 0,
348-
dtype: Dtype = np.float64,
349348
ddof: int = 1,
350349
) -> pd.Timedelta:
351350
return pd.Timedelta(
352351
cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
353-
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
352+
skipna=skipna, min_count=min_count, ddof=ddof
354353
),
355354
unit=self.time_unit,
356355
).as_unit(self.time_unit)

python/cudf/cudf/tests/test_reductions.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -248,16 +248,11 @@ def test_sum_masked(nelem):
248248

249249
def test_sum_boolean():
250250
s = Series(np.arange(100000))
251-
got = (s > 1).sum(dtype=np.int32)
251+
got = (s > 1).sum()
252252
expect = 99998
253253

254254
assert expect == got
255255

256-
got = (s > 1).sum(dtype=np.bool_)
257-
expect = True
258-
259-
assert expect == got
260-
261256

262257
def test_date_minmax():
263258
np_data = np.random.normal(size=10**3)
@@ -371,3 +366,11 @@ def test_reduction_column_multiindex():
371366
result = df.mean()
372367
expected = df.to_pandas().mean()
373368
assert_eq(result, expected)
369+
370+
371+
@pytest.mark.parametrize("op", ["sum", "product"])
372+
def test_dtype_deprecated(op):
373+
ser = cudf.Series(range(5))
374+
with pytest.warns(FutureWarning):
375+
result = getattr(ser, op)(dtype=np.dtype(np.int8))
376+
assert isinstance(result, np.int8)

0 commit comments

Comments
 (0)