1
- # Copyright (c) 2018-2020, NVIDIA CORPORATION.
1
+ # Copyright (c) 2018-2021, NVIDIA CORPORATION.
2
+
2
3
import itertools
3
4
4
5
import numpy as np
@@ -575,8 +576,8 @@ def get_dummies(
575
576
576
577
Parameters
577
578
----------
578
- df : cudf. DataFrame
579
- dataframe to encode
579
+ df : array-like, Series, or DataFrame
580
+ Data of which to get dummy indicators.
580
581
prefix : str, dict, or sequence, optional
581
582
prefix to append. Either a str (to apply a constant prefix), dict
582
583
mapping column names to prefixes, or sequence of prefixes to apply with
@@ -633,6 +634,22 @@ def get_dummies(
633
634
1 0 1 0 0
634
635
2 0 0 1 0
635
636
3 0 0 0 1
637
+
638
+ >>> series = cudf.Series([1, 2, None, 2, 4])
639
+ >>> series
640
+ 0 1
641
+ 1 2
642
+ 2 <NA>
643
+ 3 2
644
+ 4 4
645
+ dtype: int64
646
+ >>> cudf.get_dummies(series, dummy_na=True)
647
+ null 1 2 4
648
+ 0 0 1 0 0
649
+ 1 0 0 1 0
650
+ 2 1 0 0 0
651
+ 3 0 0 1 0
652
+ 4 0 0 0 1
636
653
"""
637
654
if cats is None :
638
655
cats = {}
@@ -642,66 +659,72 @@ def get_dummies(
642
659
if drop_first :
643
660
raise NotImplementedError ("drop_first is not supported yet" )
644
661
645
- encode_fallback_dtypes = ["object" , "category" ]
662
+ if isinstance (df , cudf .DataFrame ):
663
+ encode_fallback_dtypes = ["object" , "category" ]
646
664
647
- if columns is None or len (columns ) == 0 :
648
- columns = df .select_dtypes (include = encode_fallback_dtypes ).columns
665
+ if columns is None or len (columns ) == 0 :
666
+ columns = df .select_dtypes (include = encode_fallback_dtypes ).columns
649
667
650
- def length_check (obj , name ):
651
- if cudf .utils .dtypes .is_list_like (obj ):
652
- if len (obj ) != len (columns ):
653
- raise ValueError (
654
- f"Length of '{ name } ' ({ len (obj )} ) did not match the "
655
- f"length of the columns being encoded ({ len (columns )} )."
656
- )
668
+ _length_check_params (prefix , columns , "prefix" )
669
+ _length_check_params (prefix_sep , columns , "prefix_sep" )
657
670
658
- length_check ( prefix , " prefix" )
659
- length_check ( prefix_sep , "prefix_sep" )
671
+ if prefix is None :
672
+ prefix = columns
660
673
661
- if prefix is None :
662
- prefix = columns
674
+ if isinstance (prefix , str ):
675
+ prefix_map = {}
676
+ elif isinstance (prefix , dict ):
677
+ prefix_map = prefix
678
+ else :
679
+ prefix_map = dict (zip (columns , prefix ))
663
680
664
- if isinstance (prefix , str ):
665
- prefix_map = {}
666
- elif isinstance (prefix , dict ):
667
- prefix_map = prefix
668
- else :
669
- prefix_map = dict (zip (columns , prefix ))
681
+ if isinstance (prefix_sep , str ):
682
+ prefix_sep_map = {}
683
+ elif isinstance (prefix_sep , dict ):
684
+ prefix_sep_map = prefix_sep
685
+ else :
686
+ prefix_sep_map = dict (zip (columns , prefix_sep ))
670
687
671
- if isinstance (prefix_sep , str ):
672
- prefix_sep_map = {}
673
- elif isinstance (prefix_sep , dict ):
674
- prefix_sep_map = prefix_sep
675
- else :
676
- prefix_sep_map = dict (zip (columns , prefix_sep ))
688
+ # If we have no columns to encode, we need to drop
689
+ # fallback columns(if any)
690
+ if len (columns ) == 0 :
691
+ return df .select_dtypes (exclude = encode_fallback_dtypes )
692
+ else :
693
+ result_df = df .copy (deep = False )
694
+ result_df .drop (columns = columns , inplace = True )
695
+
696
+ for name in columns :
697
+ unique = _get_unique (column = df ._data [name ], dummy_na = dummy_na )
698
+
699
+ col_enc_df = df .one_hot_encoding (
700
+ name ,
701
+ prefix = prefix_map .get (name , prefix ),
702
+ cats = cats .get (name , unique ),
703
+ prefix_sep = prefix_sep_map .get (name , prefix_sep ),
704
+ dtype = dtype ,
705
+ )
706
+ for col in col_enc_df .columns .difference (df ._data .names ):
707
+ result_df [col ] = col_enc_df ._data [col ]
677
708
678
- # If we have no columns to encode, we need to drop fallback columns(if any)
679
- if len (columns ) == 0 :
680
- return df .select_dtypes (exclude = encode_fallback_dtypes )
709
+ return result_df
681
710
else :
682
- result_df = df .drop (columns = columns )
683
- for name in columns :
684
- if isinstance (
685
- df [name ]._column , cudf .core .column .CategoricalColumn
686
- ):
687
- unique = df [name ]._column .categories
688
- else :
689
- unique = df [name ].unique ()
690
-
691
- if not dummy_na :
692
- if np .issubdtype (unique .dtype , np .floating ):
693
- unique = unique .nans_to_nulls ()
694
- unique = unique .dropna ()
695
-
696
- col_enc_df = df .one_hot_encoding (
697
- name ,
698
- prefix = prefix_map .get (name , prefix ),
699
- cats = cats .get (name , unique ),
700
- prefix_sep = prefix_sep_map .get (name , prefix_sep ),
701
- dtype = dtype ,
702
- )
703
- for col in col_enc_df .columns .difference (df ._data .names ):
704
- result_df [col ] = col_enc_df ._data [col ]
711
+ ser = cudf .Series (df )
712
+ unique = _get_unique (column = ser ._column , dummy_na = dummy_na )
713
+
714
+ if hasattr (unique , "to_arrow" ):
715
+ cats = unique .to_arrow ().to_pylist ()
716
+ else :
717
+ cats = pd .Series (unique , dtype = "object" )
718
+
719
+ col_names = ["null" if cat is None else cat for cat in cats ]
720
+
721
+ if prefix is not None :
722
+ col_names = [f"{ prefix } { prefix_sep } { cat } " for cat in col_names ]
723
+
724
+ newcols = ser .one_hot_encoding (cats = cats , dtype = dtype )
725
+ result_df = cudf .DataFrame (index = ser .index )
726
+ for i , col in enumerate (newcols ):
727
+ result_df ._data [col_names [i ]] = col
705
728
706
729
return result_df
707
730
@@ -1013,3 +1036,29 @@ def unstack(df, level, fill_value=None):
1013
1036
if result .index .nlevels == 1 :
1014
1037
result .index = result .index .get_level_values (result .index .names [0 ])
1015
1038
return result
1039
+
1040
+
1041
def _get_unique(column, dummy_na):
    """
    Collect the distinct values of ``column``.

    A ``CategoricalColumn`` contributes its ``categories``; any other
    column contributes the result of ``column.unique()``.

    When ``dummy_na`` is falsy, ``nans_to_nulls()`` is applied first if
    the values have a floating dtype, and ``dropna()`` is then applied
    in every case. When ``dummy_na`` is truthy the values are returned
    untouched.
    """
    is_categorical = isinstance(column, cudf.core.column.CategoricalColumn)
    values = column.categories if is_categorical else column.unique()

    # Caller asked for a dummy column for missing values: keep them.
    if dummy_na:
        return values

    if np.issubdtype(values.dtype, np.floating):
        values = values.nans_to_nulls()
    return values.dropna()
1055
+
1056
+
1057
def _length_check_params(obj, columns, name):
    """
    Check that a list-like parameter has one entry per encoded column.

    Objects that ``cudf.utils.dtypes.is_list_like`` does not report as
    list-like pass through without any length check.

    Parameters
    ----------
    obj : object
        Parameter value to validate.
    columns : sized collection
        Columns being encoded; only its length is used.
    name : str
        Parameter name, interpolated into the error message.

    Raises
    ------
    ValueError
        If ``obj`` is list-like and ``len(obj) != len(columns)``.
    """
    if not cudf.utils.dtypes.is_list_like(obj):
        return

    n_obj, n_columns = len(obj), len(columns)
    if n_obj == n_columns:
        return

    raise ValueError(
        f"Length of '{name}' ({n_obj}) did not match the "
        f"length of the columns being "
        f"encoded ({n_columns})."
    )
0 commit comments