
Commit 4c6a57c

Add support for array-like inputs in cudf.get_dummies (#7181)

Fixes: #7031. This PR introduces array-like input support in `cudf.get_dummies`. I think in the near future we will have to deprecate and adopt a new name for `get_dummies`: pandas-dev/pandas#35724

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #7181

1 parent 4111cb7 · commit 4c6a57c
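
As a quick illustration of what this change enables (a minimal usage sketch, not taken from the PR itself; it assumes a cuDF build that already includes this commit, and the exact column order and printed formatting may differ):

    # Minimal usage sketch (not from the PR itself); assumes a cuDF build
    # that already contains this commit.
    import cudf

    ser = cudf.Series(["a", "b", None, "a"])

    # get_dummies now accepts a Series/Index/array-like directly instead of
    # requiring a DataFrame; with dummy_na=True the missing value gets its
    # own "null" indicator column, and prefix/prefix_sep shape column names.
    encoded = cudf.get_dummies(ser, prefix="col", prefix_sep="_", dummy_na=True)
    print(encoded)
    # -> one uint8 indicator column per category ("col_a", "col_b") plus "col_null"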

File tree

3 files changed: +149 -55 lines

  python/cudf/cudf/core/reshape.py       +104 -55
  python/cudf/cudf/core/series.py        +1
  python/cudf/cudf/tests/test_onehot.py  +44

python/cudf/cudf/core/reshape.py (+104 -55)

@@ -1,4 +1,5 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
 import itertools

 import numpy as np
@@ -575,8 +576,8 @@ def get_dummies(

     Parameters
     ----------
-    df : cudf.DataFrame
-        dataframe to encode
+    df : array-like, Series, or DataFrame
+        Data of which to get dummy indicators.
     prefix : str, dict, or sequence, optional
         prefix to append. Either a str (to apply a constant prefix), dict
         mapping column names to prefixes, or sequence of prefixes to apply with
@@ -633,6 +634,22 @@ def get_dummies(
     1  0  1  0  0
     2  0  0  1  0
     3  0  0  0  1
+
+    >>> series = cudf.Series([1, 2, None, 2, 4])
+    >>> series
+    0       1
+    1       2
+    2    <NA>
+    3       2
+    4       4
+    dtype: int64
+    >>> cudf.get_dummies(series, dummy_na=True)
+       null  1  2  4
+    0     0  1  0  0
+    1     0  0  1  0
+    2     1  0  0  0
+    3     0  0  1  0
+    4     0  0  0  1
     """
     if cats is None:
         cats = {}
@@ -642,66 +659,72 @@ def get_dummies(
     if drop_first:
         raise NotImplementedError("drop_first is not supported yet")

-    encode_fallback_dtypes = ["object", "category"]
+    if isinstance(df, cudf.DataFrame):
+        encode_fallback_dtypes = ["object", "category"]

-    if columns is None or len(columns) == 0:
-        columns = df.select_dtypes(include=encode_fallback_dtypes).columns
+        if columns is None or len(columns) == 0:
+            columns = df.select_dtypes(include=encode_fallback_dtypes).columns

-    def length_check(obj, name):
-        if cudf.utils.dtypes.is_list_like(obj):
-            if len(obj) != len(columns):
-                raise ValueError(
-                    f"Length of '{name}' ({len(obj)}) did not match the "
-                    f"length of the columns being encoded ({len(columns)})."
-                )
+        _length_check_params(prefix, columns, "prefix")
+        _length_check_params(prefix_sep, columns, "prefix_sep")

-    length_check(prefix, "prefix")
-    length_check(prefix_sep, "prefix_sep")
+        if prefix is None:
+            prefix = columns

-    if prefix is None:
-        prefix = columns
+        if isinstance(prefix, str):
+            prefix_map = {}
+        elif isinstance(prefix, dict):
+            prefix_map = prefix
+        else:
+            prefix_map = dict(zip(columns, prefix))

-    if isinstance(prefix, str):
-        prefix_map = {}
-    elif isinstance(prefix, dict):
-        prefix_map = prefix
-    else:
-        prefix_map = dict(zip(columns, prefix))
+        if isinstance(prefix_sep, str):
+            prefix_sep_map = {}
+        elif isinstance(prefix_sep, dict):
+            prefix_sep_map = prefix_sep
+        else:
+            prefix_sep_map = dict(zip(columns, prefix_sep))

-    if isinstance(prefix_sep, str):
-        prefix_sep_map = {}
-    elif isinstance(prefix_sep, dict):
-        prefix_sep_map = prefix_sep
-    else:
-        prefix_sep_map = dict(zip(columns, prefix_sep))
+        # If we have no columns to encode, we need to drop
+        # fallback columns(if any)
+        if len(columns) == 0:
+            return df.select_dtypes(exclude=encode_fallback_dtypes)
+        else:
+            result_df = df.copy(deep=False)
+            result_df.drop(columns=columns, inplace=True)
+
+            for name in columns:
+                unique = _get_unique(column=df._data[name], dummy_na=dummy_na)
+
+                col_enc_df = df.one_hot_encoding(
+                    name,
+                    prefix=prefix_map.get(name, prefix),
+                    cats=cats.get(name, unique),
+                    prefix_sep=prefix_sep_map.get(name, prefix_sep),
+                    dtype=dtype,
+                )
+                for col in col_enc_df.columns.difference(df._data.names):
+                    result_df[col] = col_enc_df._data[col]

-    # If we have no columns to encode, we need to drop fallback columns(if any)
-    if len(columns) == 0:
-        return df.select_dtypes(exclude=encode_fallback_dtypes)
+        return result_df
     else:
-        result_df = df.drop(columns=columns)
-        for name in columns:
-            if isinstance(
-                df[name]._column, cudf.core.column.CategoricalColumn
-            ):
-                unique = df[name]._column.categories
-            else:
-                unique = df[name].unique()
-
-            if not dummy_na:
-                if np.issubdtype(unique.dtype, np.floating):
-                    unique = unique.nans_to_nulls()
-                unique = unique.dropna()
-
-            col_enc_df = df.one_hot_encoding(
-                name,
-                prefix=prefix_map.get(name, prefix),
-                cats=cats.get(name, unique),
-                prefix_sep=prefix_sep_map.get(name, prefix_sep),
-                dtype=dtype,
-            )
-            for col in col_enc_df.columns.difference(df._data.names):
-                result_df[col] = col_enc_df._data[col]
+        ser = cudf.Series(df)
+        unique = _get_unique(column=ser._column, dummy_na=dummy_na)
+
+        if hasattr(unique, "to_arrow"):
+            cats = unique.to_arrow().to_pylist()
+        else:
+            cats = pd.Series(unique, dtype="object")
+
+        col_names = ["null" if cat is None else cat for cat in cats]
+
+        if prefix is not None:
+            col_names = [f"{prefix}{prefix_sep}{cat}" for cat in col_names]
+
+        newcols = ser.one_hot_encoding(cats=cats, dtype=dtype)
+        result_df = cudf.DataFrame(index=ser.index)
+        for i, col in enumerate(newcols):
+            result_df._data[col_names[i]] = col

     return result_df

@@ -1013,3 +1036,29 @@ def unstack(df, level, fill_value=None):
     if result.index.nlevels == 1:
         result.index = result.index.get_level_values(result.index.names[0])
     return result
+
+
+def _get_unique(column, dummy_na):
+    """
+    Returns unique values in a column, if
+    dummy_na is False, nan's are also dropped.
+    """
+    if isinstance(column, cudf.core.column.CategoricalColumn):
+        unique = column.categories
+    else:
+        unique = column.unique()
+    if not dummy_na:
+        if np.issubdtype(unique.dtype, np.floating):
+            unique = unique.nans_to_nulls()
+        unique = unique.dropna()
+    return unique
+
+
+def _length_check_params(obj, columns, name):
+    if cudf.utils.dtypes.is_list_like(obj):
+        if len(obj) != len(columns):
+            raise ValueError(
+                f"Length of '{name}' ({len(obj)}) did not match the "
+                f"length of the columns being "
+                f"encoded ({len(columns)})."
+            )

python/cudf/cudf/core/series.py (+1)

@@ -1,4 +1,5 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
 import pickle
 import warnings
 from collections import abc as abc

python/cudf/cudf/tests/test_onehot.py (+44)

@@ -205,3 +205,47 @@ def test_get_dummies_with_nan():
     actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

     utils.assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]),
+        cudf.Index([None, 1, 2, 3.3, None, 0.2]),
+        cudf.Series([0.1, 2, 3, None, np.nan]),
+        cudf.Series([23678, 324, 1, 324], name="abc"),
+    ],
+)
+@pytest.mark.parametrize("prefix_sep", ["-", "#"])
+@pytest.mark.parametrize("prefix", [None, "hi"])
+@pytest.mark.parametrize("dtype", ["uint8", "int16"])
+def test_get_dummies_array_like(data, prefix_sep, prefix, dtype):
+    expected = cudf.get_dummies(
+        data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype
+    )
+    if isinstance(data, (cudf.Series, cudf.Index)):
+        pd_data = data.to_pandas()
+    else:
+        pd_data = data
+
+    actual = pd.get_dummies(
+        pd_data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype
+    )
+    utils.assert_eq(expected, actual)
+
+
+def test_get_dummies_array_like_with_nan():
+    ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False)
+    expected = cudf.DataFrame(
+        {
+            "a_null": [0, 0, 0, 1, 0],
+            "a_0.1": [1, 0, 0, 0, 0],
+            "a_2.0": [0, 1, 0, 0, 0],
+            "a_3.0": [0, 0, 1, 0, 0],
+            "a_nan": [0, 0, 0, 0, 1],
+        },
+        dtype="uint8",
+    )
+    actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")
+
+    utils.assert_eq(expected, actual)
