Skip to content

Commit ebafb72

Browse files
author
Artemy Kolchinsky
committed
ENH: Allow get_dummies to return sparse dataframe
ENH: Allow get_dummies to return sparse dataframe (follow-up notes in commit body: Fix, Fix, Fixes)
1 parent c03e92f commit ebafb72

File tree

2 files changed

+72
-41
lines changed

2 files changed

+72
-41
lines changed

pandas/core/reshape.py

+42-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
from pandas.core.series import Series
1010
from pandas.core.frame import DataFrame
1111

12+
from pandas.core.sparse import SparseDataFrame, SparseSeries
13+
from pandas._sparse import IntIndex
14+
1215
from pandas.core.categorical import Categorical
1316
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
1417
isnull)
@@ -1005,7 +1008,7 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
10051008

10061009

10071010
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1008-
columns=None):
1011+
columns=None, sparse=False):
10091012
"""
10101013
Convert categorical variable into dummy/indicator variables
10111014
@@ -1026,6 +1029,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
10261029
Column names in the DataFrame to be encoded.
10271030
If `columns` is None then all the columns with
10281031
`object` or `category` dtype will be converted.
1032+
sparse : bool, default False
1033+
Whether the returned DataFrame should be sparse or not.
10291034
10301035
Returns
10311036
-------
@@ -1112,16 +1117,17 @@ def check_len(item, name):
11121117
with_dummies = [result]
11131118
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
11141119

1115-
dummy = _get_dummies_1d(data[col], prefix=pre,
1116-
prefix_sep=sep, dummy_na=dummy_na)
1120+
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
1121+
dummy_na=dummy_na, sparse=sparse)
11171122
with_dummies.append(dummy)
11181123
result = concat(with_dummies, axis=1)
11191124
else:
1120-
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1125+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
1126+
sparse=sparse)
11211127
return result
11221128

11231129

1124-
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
1130+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
11251131
# Series avoids inconsistent NaN handling
11261132
cat = Categorical.from_array(Series(data))
11271133
levels = cat.categories
@@ -1132,19 +1138,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
11321138
index = data.index
11331139
else:
11341140
index = np.arange(len(data))
1135-
return DataFrame(index=index)
1136-
1137-
number_of_cols = len(levels)
1138-
if dummy_na:
1139-
number_of_cols += 1
1140-
1141-
dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)
1141+
if not sparse:
1142+
return DataFrame(index=index)
1143+
else:
1144+
return SparseDataFrame(index=index)
11421145

1146+
codes = cat.codes.copy()
11431147
if dummy_na:
1148+
codes[codes == -1] = len(cat.categories)
11441149
levels = np.append(cat.categories, np.nan)
1145-
else:
1146-
# reset NaN GH4446
1147-
dummy_mat[cat.codes == -1] = 0
1150+
1151+
number_of_cols = len(levels)
11481152

11491153
if prefix is not None:
11501154
dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
@@ -1157,7 +1161,29 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
11571161
else:
11581162
index = None
11591163

1160-
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
1164+
if sparse:
1165+
sparse_series = {}
1166+
N = len(data)
1167+
for code, col in enumerate(dummy_cols):
1168+
if code != -1:
1169+
sp_index = np.flatnonzero(codes == code)
1170+
sp_data = np.ones(len(sp_index))
1171+
else: # Blank entries if not dummy_na and code == -1, #GH4446
1172+
sp_index, sp_data = [], []
1173+
1174+
sparse_series[col] = SparseSeries(data=np.array(sp_data),
1175+
sparse_index=IntIndex(N, sp_index), index=index, fill_value=0)
1176+
1177+
return SparseDataFrame(sparse_series, index=index, default_fill_value=0)
1178+
1179+
else:
1180+
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
1181+
1182+
if not dummy_na:
1183+
# reset NaN GH4446
1184+
dummy_mat[codes == -1] = 0
1185+
1186+
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
11611187

11621188

11631189
def make_axis_dummies(frame, axis='minor', transform=None):

pandas/tests/test_reshape.py

+30-25
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ def test_multiindex(self):
150150

151151
class TestGetDummies(tm.TestCase):
152152

153+
sparse = False
154+
153155
def setUp(self):
154156
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
155157
'C': [1, 2, 3]})
@@ -162,20 +164,20 @@ def test_basic(self):
162164
expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
163165
'b': {0: 0.0, 1: 1.0, 2: 0.0},
164166
'c': {0: 0.0, 1: 0.0, 2: 1.0}})
165-
assert_frame_equal(get_dummies(s_list), expected)
166-
assert_frame_equal(get_dummies(s_series), expected)
167+
assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected)
168+
assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected)
167169

168170
expected.index = list('ABC')
169-
assert_frame_equal(get_dummies(s_series_index), expected)
171+
assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected)
170172

171173
def test_just_na(self):
172174
just_na_list = [np.nan]
173175
just_na_series = Series(just_na_list)
174176
just_na_series_index = Series(just_na_list, index = ['A'])
175177

176-
res_list = get_dummies(just_na_list)
177-
res_series = get_dummies(just_na_series)
178-
res_series_index = get_dummies(just_na_series_index)
178+
res_list = get_dummies(just_na_list, sparse=self.sparse)
179+
res_series = get_dummies(just_na_series, sparse=self.sparse)
180+
res_series_index = get_dummies(just_na_series_index, sparse=self.sparse)
179181

180182
self.assertEqual(res_list.empty, True)
181183
self.assertEqual(res_series.empty, True)
@@ -187,20 +189,21 @@ def test_just_na(self):
187189

188190
def test_include_na(self):
189191
s = ['a', 'b', np.nan]
190-
res = get_dummies(s)
192+
res = get_dummies(s, sparse=self.sparse)
191193
exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
192194
'b': {0: 0.0, 1: 1.0, 2: 0.0}})
193195
assert_frame_equal(res, exp)
194196

195-
res_na = get_dummies(s, dummy_na=True)
197+
# Sparse dataframes do not allow nan labelled columns, see #GH8822
198+
res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
196199
exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
197200
'a': {0: 1.0, 1: 0.0, 2: 0.0},
198201
'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1)
199202
# hack (NaN handling in assert_index_equal)
200203
exp_na.columns = res_na.columns
201204
assert_frame_equal(res_na, exp_na)
202205

203-
res_just_na = get_dummies([nan], dummy_na=True)
206+
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
204207
exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan])
205208
assert_array_equal(res_just_na.values, exp_just_na.values)
206209

@@ -209,21 +212,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
209212
e = 'e'
210213
eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
211214
s = [e, eacute, eacute]
212-
res = get_dummies(s, prefix='letter')
215+
res = get_dummies(s, prefix='letter', sparse=self.sparse)
213216
exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0},
214217
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
215218
assert_frame_equal(res, exp)
216219

217220
def test_dataframe_dummies_all_obj(self):
218221
df = self.df[['A', 'B']]
219-
result = get_dummies(df)
222+
result = get_dummies(df, sparse=self.sparse)
220223
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
221224
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
222225
assert_frame_equal(result, expected)
223226

224227
def test_dataframe_dummies_mix_default(self):
225228
df = self.df
226-
result = get_dummies(df)
229+
result = get_dummies(df, sparse=self.sparse)
227230
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
228231
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
229232
'B_c': [0., 0, 1]})
@@ -234,18 +237,18 @@ def test_dataframe_dummies_prefix_list(self):
234237
prefixes = ['from_A', 'from_B']
235238
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
236239
'C': [1, 2, 3]})
237-
result = get_dummies(df, prefix=prefixes)
240+
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
238241
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
239242
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
240243
'from_B_c': [0., 0, 1]})
241244
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
242245
'from_B_c']]
243246
assert_frame_equal(result, expected)
244247

245-
def test_datafrmae_dummies_prefix_str(self):
248+
def test_dataframe_dummies_prefix_str(self):
246249
# not that you should do this...
247250
df = self.df
248-
result = get_dummies(df, prefix='bad')
251+
result = get_dummies(df, prefix='bad', sparse=self.sparse)
249252
expected = DataFrame([[1, 1., 0., 1., 0.],
250253
[2, 0., 1., 1., 0.],
251254
[3, 1., 0., 0., 1.]],
@@ -255,40 +258,40 @@ def test_datafrmae_dummies_prefix_str(self):
255258
def test_dataframe_dummies_subset(self):
256259
df = self.df
257260
result = get_dummies(df, prefix=['from_A'],
258-
columns=['A'])
261+
columns=['A'], sparse=self.sparse)
259262
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
260263
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
261264
assert_frame_equal(result, expected)
262265

263266
def test_dataframe_dummies_prefix_sep(self):
264267
df = self.df
265-
result = get_dummies(df, prefix_sep='..')
268+
result = get_dummies(df, prefix_sep='..', sparse=self.sparse)
266269
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
267270
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
268271
'B..c': [0., 0, 1]})
269272
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
270273
assert_frame_equal(result, expected)
271274

272-
result = get_dummies(df, prefix_sep=['..', '__'])
275+
result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse)
273276
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
274277
assert_frame_equal(result, expected)
275278

276-
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
279+
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse)
277280
assert_frame_equal(result, expected)
278281

279282
def test_dataframe_dummies_prefix_bad_length(self):
280283
with tm.assertRaises(ValueError):
281-
get_dummies(self.df, prefix=['too few'])
284+
get_dummies(self.df, prefix=['too few'], sparse=self.sparse)
282285

283286
def test_dataframe_dummies_prefix_sep_bad_length(self):
284287
with tm.assertRaises(ValueError):
285-
get_dummies(self.df, prefix_sep=['bad'])
288+
get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse)
286289

287290
def test_dataframe_dummies_prefix_dict(self):
288291
prefixes = {'A': 'from_A', 'B': 'from_B'}
289292
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
290293
'C': [1, 2, 3]})
291-
result = get_dummies(df, prefix=prefixes)
294+
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
292295
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
293296
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
294297
'C': [1, 2, 3]})
@@ -297,22 +300,22 @@ def test_dataframe_dummies_prefix_dict(self):
297300
def test_dataframe_dummies_with_na(self):
298301
df = self.df
299302
df.loc[3, :] = [np.nan, np.nan, np.nan]
300-
result = get_dummies(df, dummy_na=True)
303+
result = get_dummies(df, dummy_na=True, sparse=self.sparse)
301304
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
302305
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
303306
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
304307
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
305308
'B_nan']]
306309
assert_frame_equal(result, expected)
307310

308-
result = get_dummies(df, dummy_na=False)
311+
result = get_dummies(df, dummy_na=False, sparse=self.sparse)
309312
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
310313
assert_frame_equal(result, expected)
311314

312315
def test_dataframe_dummies_with_categorical(self):
313316
df = self.df
314317
df['cat'] = pd.Categorical(['x', 'y', 'y'])
315-
result = get_dummies(df)
318+
result = get_dummies(df, sparse=self.sparse)
316319
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
317320
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
318321
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
@@ -321,6 +324,8 @@ def test_dataframe_dummies_with_categorical(self):
321324
'cat_x', 'cat_y']]
322325
assert_frame_equal(result, expected)
323326

327+
class TestGetDummiesSparse(TestGetDummies):
328+
sparse = True
324329

325330
class TestConvertDummies(tm.TestCase):
326331
def test_convert_dummies(self):

0 commit comments

Comments
 (0)