import pandas as pd

-
from pandas.types.common import (_coerce_to_dtype,
                                 _ensure_int64,
                                 needs_i8_conversion,

    args_transpose='axes to permute (int or label for object)',
    optional_by="""
        by : str or list of str
-            Name or list of names which refer to the axis items.""")
+            Name or list of names which refer to the axis items.""",
+    versionadded_get_dummies="",
+    other_get_dummies=""
+ )


def _single_replace(self, to_replace, method, inplace, limit):
@@ -6069,3 +6071,283 @@ def logical_func(self, axis=None, bool_only=None, skipna=None, level=None,
# install the indexes
for _name, _indexer in indexing.get_indexers_list():
    NDFrame._create_indexer(_name, _indexer)
+
+
+ _shared_docs['get_dummies'] = """
+ Convert categorical variable into dummy/indicator variables
+
+ %(versionadded_get_dummies)s
+
+ Parameters
+ ----------
+ data : array-like, Series, or DataFrame
+ prefix : string, list of strings, or dict of strings, default None
+     String to append to DataFrame column names.
+     Pass a list with length equal to the number of columns
+     when calling get_dummies on a DataFrame. Alternatively, `prefix`
+     can be a dictionary mapping column names to prefixes.
+ prefix_sep : string, default '_'
+     If appending prefix, separator/delimiter to use. Or pass a
+     list or dictionary as with `prefix`.
+ dummy_na : bool, default False
+     Add a column to indicate NaNs; if False NaNs are ignored.
+ columns : list-like, default None
+     Column names in the DataFrame to be encoded.
+     If `columns` is None then all the columns with
+     `object` or `category` dtype will be converted.
+ sparse : bool, default False
+     Whether the dummy columns should be sparse or not. Returns
+     SparseDataFrame if `data` is a Series or if all columns are included.
+     Otherwise returns a DataFrame with some SparseBlocks.
+
+     .. versionadded:: 0.16.1
+ drop_first : bool, default False
+     Whether to get k-1 dummies out of k categorical levels by removing the
+     first level.
+
+     .. versionadded:: 0.18.0
+
+ Returns
+ -------
+ dummies : DataFrame or SparseDataFrame
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import pandas as pd
+ >>> s = pd.Series(list('abca'))
+
+ >>> pd.get_dummies(s)
+    a  b  c
+ 0  1  0  0
+ 1  0  1  0
+ 2  0  0  1
+ 3  1  0  0
+
+ >>> s1 = ['a', 'b', np.nan]
+
+ >>> pd.get_dummies(s1)
+    a  b
+ 0  1  0
+ 1  0  1
+ 2  0  0
+
+ >>> pd.get_dummies(s1, dummy_na=True)
+    a  b  NaN
+ 0  1  0    0
+ 1  0  1    0
+ 2  0  0    1
+
+ >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
+ ...                    'C': [1, 2, 3]})
+
+ >>> pd.get_dummies(df, prefix=['col1', 'col2'])
+    C  col1_a  col1_b  col2_a  col2_b  col2_c
+ 0  1       1       0       0       1       0
+ 1  2       0       1       1       0       0
+ 2  3       1       0       0       0       1
+
+ >>> pd.get_dummies(pd.Series(list('abcaa')))
+    a  b  c
+ 0  1  0  0
+ 1  0  1  0
+ 2  0  0  1
+ 3  1  0  0
+ 4  1  0  0
+
+ >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
+    b  c
+ 0  0  0
+ 1  1  0
+ 2  0  1
+ 3  0  0
+ 4  0  0
+
+ See Also
+ --------
+ %(other_get_dummies)s
+ Series.str.get_dummies
+ """
+
+
+ @Appender(_shared_docs['get_dummies'] % _shared_doc_kwargs)
+ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
+                 columns=None, sparse=False, drop_first=False):
+     from pandas.tools.concat import concat
+     from itertools import cycle
+     from pandas.core.frame import DataFrame
+
+     if isinstance(data, DataFrame):
+         # determine columns being encoded
+
+         if columns is None:
+             columns_to_encode = data.select_dtypes(
+                 include=['object', 'category']).columns
+         else:
+             columns_to_encode = columns
+
+         # validate prefixes and separator to avoid silently dropping cols
+         def check_len(item, name):
+             length_msg = ("Length of '{0}' ({1}) did not match the length of "
+                           "the columns being encoded ({2}).")
+
+             if is_list_like(item):
+                 if not len(item) == len(columns_to_encode):
+                     raise ValueError(length_msg.format(name, len(item),
+                                                        len(columns_to_encode)))
+
+         check_len(prefix, 'prefix')
+         check_len(prefix_sep, 'prefix_sep')
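+         # normalise `prefix`: a single string is recycled for every encoded
+         # column, while a dict is expanded in the order of the encoded columns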
+         if isinstance(prefix, compat.string_types):
+             prefix = cycle([prefix])
+         if isinstance(prefix, dict):
+             prefix = [prefix[col] for col in columns_to_encode]
+
+         if prefix is None:
+             prefix = columns_to_encode
+
+         # validate separators
+         if isinstance(prefix_sep, compat.string_types):
+             prefix_sep = cycle([prefix_sep])
+         elif isinstance(prefix_sep, dict):
+             prefix_sep = [prefix_sep[col] for col in columns_to_encode]
+
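+         # columns that are not being encoded are carried through unchanged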
+         if set(columns_to_encode) == set(data.columns):
+             with_dummies = []
+         else:
+             with_dummies = [data.drop(columns_to_encode, axis=1)]
+
+         for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
+
+             dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
+                                     dummy_na=dummy_na, sparse=sparse,
+                                     drop_first=drop_first)
+             with_dummies.append(dummy)
+         result = concat(with_dummies, axis=1)
+     else:
+         result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
+                                  sparse=sparse, drop_first=drop_first)
+     return result
+
+
+ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
+                     sparse=False, drop_first=False):
+     from pandas.core.sparse import SparseDataFrame, SparseSeries
+     from pandas.sparse.array import SparseArray
+     from pandas._sparse import IntIndex
+     from pandas.core.series import Series
+     from pandas.core.frame import DataFrame
+
+     from pandas.core.categorical import _factorize_from_iterable
+     # Series avoids inconsistent NaN handling
+     codes, levels = _factorize_from_iterable(Series(data))
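+     # codes: one integer per row (-1 marks missing values);
+     # levels: the distinct non-null values, in factorization order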
+
+     def get_empty_Frame(data, sparse):
+         if isinstance(data, Series):
+             index = data.index
+         else:
+             index = np.arange(len(data))
+         if not sparse:
+             return DataFrame(index=index)
+         else:
+             return SparseDataFrame(index=index)
+
+     # if all NaN
+     if not dummy_na and len(levels) == 0:
+         return get_empty_Frame(data, sparse)
+
+     codes = codes.copy()
+     if dummy_na:
+         codes[codes == -1] = len(levels)
+         levels = np.append(levels, np.nan)
+
+     # if dummy_na, we just fake a nan level. drop_first will drop it again
+     if drop_first and len(levels) == 1:
+         return get_empty_Frame(data, sparse)
+
+     number_of_cols = len(levels)
+
+     if prefix is not None:
+         dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
+     else:
+         dummy_cols = levels
+
+     if isinstance(data, Series):
+         index = data.index
+     else:
+         index = None
+
+     if sparse:
+         sparse_series = {}
+         N = len(data)
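+         # for every dummy column, collect the integer row positions at which
+         # that level occurs; these become the sparse index of the column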
+         sp_indices = [[] for _ in range(len(dummy_cols))]
+         for ndx, code in enumerate(codes):
+             if code == -1:
+                 # Blank entries if not dummy_na and code == -1, #GH4446
+                 continue
+             sp_indices[code].append(ndx)
+
+         if drop_first:
+             # remove first categorical level to avoid perfect collinearity
+             # GH12042
+             sp_indices = sp_indices[1:]
+             dummy_cols = dummy_cols[1:]
+         for col, ixs in zip(dummy_cols, sp_indices):
+             sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
+                                sparse_index=IntIndex(N, ixs), fill_value=0,
+                                dtype=np.uint8)
+             sparse_series[col] = SparseSeries(data=sarr, index=index)
+
+         out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
+                               dtype=np.uint8)
+         return out
+
+     else:
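+         # dense path: row i of the identity matrix is the one-hot vector for
+         # code i, and take() gathers one such row per observation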
+         dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)
+
+         if not dummy_na:
+             # reset NaN GH4446
+             dummy_mat[codes == -1] = 0
+
+         if drop_first:
+             # remove first GH12042
+             dummy_mat = dummy_mat[:, 1:]
+             dummy_cols = dummy_cols[1:]
+         return DataFrame(dummy_mat, index=index, columns=dummy_cols)
+
+
+ def make_axis_dummies(frame, axis='minor', transform=None):
+     """
+     Construct 1-0 dummy variables corresponding to designated axis
+     labels
+
+     Parameters
+     ----------
+     frame : DataFrame
+     axis : {'major', 'minor'}, default 'minor'
+     transform : function, default None
+         Function to apply to axis labels first. For example, to
+         get "day of week" dummies in a time series regression
+         you might call::
+
+             make_axis_dummies(panel, axis='major',
+                               transform=lambda d: d.weekday())
+
+     Returns
+     -------
+     dummies : DataFrame
+         Column names taken from chosen axis
+     """
+     from pandas.core.frame import DataFrame
+     from pandas.core.categorical import _factorize_from_iterable
+
+     numbers = {'major': 0, 'minor': 1}
+     num = numbers.get(axis, axis)
+
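+     # the frame is expected to carry a panel-style MultiIndex; `levels` holds
+     # the distinct values and `labels` the integer codes for the chosen axis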
+     items = frame.index.levels[num]
+     labels = frame.index.labels[num]
+     if transform is not None:
+         mapped_items = items.map(transform)
+         labels, items = _factorize_from_iterable(mapped_items.take(labels))
+
+     values = np.eye(len(items), dtype=float)
+     values = values.take(labels, axis=0)
+
+     return DataFrame(values, columns=items, index=frame.index)
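
A small usage sketch, not part of the patch itself (the frame and the prefix
mapping are illustrative), exercising the dict form of `prefix` and the
`columns` filter that the docstring describes but does not demonstrate::

    import pandas as pd

    df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
                       'C': [1, 2, 3]})

    # one prefix per encoded column, keyed by column name
    pd.get_dummies(df, prefix={'A': 'colA', 'B': 'colB'})

    # encode only 'A'; 'B' and 'C' are carried through unchanged
    pd.get_dummies(df, columns=['A'], prefix_sep='.')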