@@ -821,50 +821,61 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
821
821
from pandas .core .reshape .concat import concat
822
822
from itertools import cycle
823
823
824
+ dtypes_to_encode = ['object' , 'category' ]
825
+
824
826
if isinstance (data , DataFrame ):
825
827
# determine columns being encoded
826
-
827
828
if columns is None :
828
- columns_to_encode = data .select_dtypes (
829
- include = [ 'object' , 'category' ]). columns
829
+ data_to_encode = data .select_dtypes (
830
+ include = dtypes_to_encode )
830
831
else :
831
- columns_to_encode = columns
832
+ data_to_encode = data [ columns ]
832
833
833
834
# validate prefixes and separator to avoid silently dropping cols
834
835
def check_len (item , name ):
835
836
len_msg = ("Length of '{name}' ({len_item}) did not match the "
836
837
"length of the columns being encoded ({len_enc})." )
837
838
838
839
if is_list_like (item ):
839
- if not len (item ) == len (columns_to_encode ):
840
- len_msg = len_msg .format (name = name , len_item = len (item ),
841
- len_enc = len (columns_to_encode ))
840
+ if not len (item ) == data_to_encode .shape [1 ]:
841
+ len_msg = \
842
+ len_msg .format (name = name , len_item = len (item ),
843
+ len_enc = data_to_encode .shape [1 ])
842
844
raise ValueError (len_msg )
843
845
844
846
check_len (prefix , 'prefix' )
845
847
check_len (prefix_sep , 'prefix_sep' )
848
+
846
849
if isinstance (prefix , compat .string_types ):
847
850
prefix = cycle ([prefix ])
848
851
if isinstance (prefix , dict ):
849
- prefix = [prefix [col ] for col in columns_to_encode ]
852
+ prefix = [prefix [col ] for col in data_to_encode . columns ]
850
853
851
854
if prefix is None :
852
- prefix = columns_to_encode
855
+ prefix = data_to_encode . columns
853
856
854
857
# validate separators
855
858
if isinstance (prefix_sep , compat .string_types ):
856
859
prefix_sep = cycle ([prefix_sep ])
857
860
elif isinstance (prefix_sep , dict ):
858
- prefix_sep = [prefix_sep [col ] for col in columns_to_encode ]
861
+ prefix_sep = [prefix_sep [col ] for col in data_to_encode . columns ]
859
862
860
- if set (columns_to_encode ) == set (data .columns ):
863
+ if data_to_encode .shape == data .shape :
864
+ # Encoding the entire df, do not prepend any dropped columns
861
865
with_dummies = []
866
+ elif columns is not None :
867
+ # Encoding only cols specified in columns. Get all cols not in
868
+ # columns to prepend to result.
869
+ with_dummies = [data .drop (columns , axis = 1 )]
862
870
else :
863
- with_dummies = [data .drop (columns_to_encode , axis = 1 )]
864
-
865
- for (col , pre , sep ) in zip (columns_to_encode , prefix , prefix_sep ):
866
-
867
- dummy = _get_dummies_1d (data [col ], prefix = pre , prefix_sep = sep ,
871
+ # Encoding only object and category dtype columns. Get remaining
872
+ # columns to prepend to result.
873
+ with_dummies = [data .select_dtypes (exclude = dtypes_to_encode )]
874
+
875
+ for (col , pre , sep ) in zip (data_to_encode .iteritems (), prefix ,
876
+ prefix_sep ):
877
+ # col is (column_name, column), use just column data here
878
+ dummy = _get_dummies_1d (col [1 ], prefix = pre , prefix_sep = sep ,
868
879
dummy_na = dummy_na , sparse = sparse ,
869
880
drop_first = drop_first , dtype = dtype )
870
881
with_dummies .append (dummy )
0 commit comments