9
9
from pandas .core .series import Series
10
10
from pandas .core .frame import DataFrame
11
11
12
+ from pandas .core .sparse import SparseDataFrame , SparseSeries
13
+ from pandas ._sparse import IntIndex
14
+
12
15
from pandas .core .categorical import Categorical
13
16
from pandas .core .common import (notnull , _ensure_platform_int , _maybe_promote ,
14
17
isnull )
@@ -1005,7 +1008,7 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
1005
1008
1006
1009
1007
1010
def get_dummies (data , prefix = None , prefix_sep = '_' , dummy_na = False ,
1008
- columns = None ):
1011
+ columns = None , sparse = False ):
1009
1012
"""
1010
1013
Convert categorical variable into dummy/indicator variables
1011
1014
@@ -1026,6 +1029,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1026
1029
Column names in the DataFrame to be encoded.
1027
1030
If `columns` is None then all the columns with
1028
1031
`object` or `category` dtype will be converted.
1032
+ sparse : bool, default False
1033
+ Whether the dummy columns should be built as a SparseDataFrame (fill value 0) instead of a dense DataFrame.
1029
1034
1030
1035
Returns
1031
1036
-------
@@ -1112,16 +1117,17 @@ def check_len(item, name):
1112
1117
with_dummies = [result ]
1113
1118
for (col , pre , sep ) in zip (columns_to_encode , prefix , prefix_sep ):
1114
1119
1115
- dummy = _get_dummies_1d (data [col ], prefix = pre ,
1116
- prefix_sep = sep , dummy_na = dummy_na )
1120
+ dummy = _get_dummies_1d (data [col ], prefix = pre , prefix_sep = sep ,
1121
+ dummy_na = dummy_na , sparse = sparse )
1117
1122
with_dummies .append (dummy )
1118
1123
result = concat (with_dummies , axis = 1 )
1119
1124
else :
1120
- result = _get_dummies_1d (data , prefix , prefix_sep , dummy_na )
1125
+ result = _get_dummies_1d (data , prefix , prefix_sep , dummy_na ,
1126
+ sparse = sparse )
1121
1127
return result
1122
1128
1123
1129
1124
- def _get_dummies_1d (data , prefix , prefix_sep = '_' , dummy_na = False ):
1130
+ def _get_dummies_1d (data , prefix , prefix_sep = '_' , dummy_na = False , sparse = False ):
1125
1131
# Series avoids inconsistent NaN handling
1126
1132
cat = Categorical .from_array (Series (data ))
1127
1133
levels = cat .categories
@@ -1132,19 +1138,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
1132
1138
index = data .index
1133
1139
else :
1134
1140
index = np .arange (len (data ))
1135
- return DataFrame (index = index )
1136
-
1137
- number_of_cols = len (levels )
1138
- if dummy_na :
1139
- number_of_cols += 1
1140
-
1141
- dummy_mat = np .eye (number_of_cols ).take (cat .codes , axis = 0 )
1141
+ if not sparse :
1142
+ return DataFrame (index = index )
1143
+ else :
1144
+ return SparseDataFrame (index = index )
1142
1145
1146
+ codes = cat .codes .copy ()
1143
1147
if dummy_na :
1148
+ codes [codes == - 1 ] = len (cat .categories )
1144
1149
levels = np .append (cat .categories , np .nan )
1145
- else :
1146
- # reset NaN GH4446
1147
- dummy_mat [cat .codes == - 1 ] = 0
1150
+
1151
+ number_of_cols = len (levels )
1148
1152
1149
1153
if prefix is not None :
1150
1154
dummy_cols = ['%s%s%s' % (prefix , prefix_sep , v )
@@ -1157,7 +1161,29 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
1157
1161
else :
1158
1162
index = None
1159
1163
1160
- return DataFrame (dummy_mat , index = index , columns = dummy_cols )
1164
+ if sparse :
1165
+ sparse_series = {}
1166
+ N = len (data )
1167
+ for code , col in enumerate (dummy_cols ):
1168
+ if code != - 1 :
1169
+ sp_index = np .flatnonzero (codes == code )
1170
+ sp_data = np .ones (len (sp_index ))
1171
+ else : # Blank entries if not dummy_na and code == -1, #GH4446
1172
+ sp_index , sp_data = [], []
1173
+
1174
+ sparse_series [col ] = SparseSeries (data = np .array (sp_data ),
1175
+ sparse_index = IntIndex (N , sp_index ), index = index , fill_value = 0 )
1176
+
1177
+ return SparseDataFrame (sparse_series , index = index , default_fill_value = 0 )
1178
+
1179
+ else :
1180
+ dummy_mat = np .eye (number_of_cols ).take (codes , axis = 0 )
1181
+
1182
+ if not dummy_na :
1183
+ # zero out the rows whose code is -1 (NaN input), GH4446
1184
+ dummy_mat [codes == - 1 ] = 0
1185
+
1186
+ return DataFrame (dummy_mat , index = index , columns = dummy_cols )
1161
1187
1162
1188
1163
1189
def make_axis_dummies (frame , axis = 'minor' , transform = None ):
0 commit comments