Skip to content

Commit f04a698

Browse files
author
Artemy Kolchinsky
committed
ENH: Allow get_dummies to return sparse dataframe
ENH: Allow get_dummies to return sparse dataframe

ENH: Allow get_dummies to return sparse dataframe

ENH: Allow get_dummies to return sparse dataframe

Fix

Fix

Fixes

Bug in order of columns

Slight speed improvement
1 parent a477202 commit f04a698

File tree

2 files changed

+104
-41
lines changed

2 files changed

+104
-41
lines changed

pandas/core/reshape.py

+45-16
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,10 @@
99
from pandas.core.series import Series
1010
from pandas.core.frame import DataFrame
1111

12+
from pandas.core.sparse import SparseDataFrame, SparseSeries
13+
from pandas.sparse.array import SparseArray
14+
from pandas._sparse import IntIndex
15+
1216
from pandas.core.categorical import Categorical
1317
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
1418
isnull)
@@ -932,7 +936,7 @@ def melt_stub(df, stub, i, j):
932936
return newdf.set_index([i, j])
933937

934938
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
935-
columns=None):
939+
columns=None, sparse=False):
936940
"""
937941
Convert categorical variable into dummy/indicator variables
938942
@@ -953,6 +957,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
953957
Column names in the DataFrame to be encoded.
954958
If `columns` is None then all the columns with
955959
`object` or `category` dtype will be converted.
960+
sparse : bool, default False
961+
Whether the returned DataFrame should be sparse or not.
956962
957963
Returns
958964
-------
@@ -1039,16 +1045,17 @@ def check_len(item, name):
10391045
with_dummies = [result]
10401046
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
10411047

1042-
dummy = _get_dummies_1d(data[col], prefix=pre,
1043-
prefix_sep=sep, dummy_na=dummy_na)
1048+
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
1049+
dummy_na=dummy_na, sparse=sparse)
10441050
with_dummies.append(dummy)
10451051
result = concat(with_dummies, axis=1)
10461052
else:
1047-
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1053+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
1054+
sparse=sparse)
10481055
return result
10491056

10501057

1051-
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
1058+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
10521059
# Series avoids inconsistent NaN handling
10531060
cat = Categorical.from_array(Series(data), ordered=True)
10541061
levels = cat.categories
@@ -1059,19 +1066,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
10591066
index = data.index
10601067
else:
10611068
index = np.arange(len(data))
1062-
return DataFrame(index=index)
1063-
1064-
number_of_cols = len(levels)
1065-
if dummy_na:
1066-
number_of_cols += 1
1067-
1068-
dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)
1069+
if not sparse:
1070+
return DataFrame(index=index)
1071+
else:
1072+
return SparseDataFrame(index=index)
10691073

1074+
codes = cat.codes.copy()
10701075
if dummy_na:
1076+
codes[codes == -1] = len(cat.categories)
10711077
levels = np.append(cat.categories, np.nan)
1072-
else:
1073-
# reset NaN GH4446
1074-
dummy_mat[cat.codes == -1] = 0
1078+
1079+
number_of_cols = len(levels)
10751080

10761081
if prefix is not None:
10771082
dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
@@ -1084,7 +1089,31 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
10841089
else:
10851090
index = None
10861091

1087-
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
1092+
if sparse:
1093+
sparse_series = {}
1094+
N = len(data)
1095+
sp_indices = [ [] for _ in range(len(dummy_cols)) ]
1096+
for ndx, code in enumerate(codes):
1097+
if code == -1:
1098+
# Blank entries if not dummy_na and code == -1, #GH4446
1099+
continue
1100+
sp_indices[code].append(ndx)
1101+
1102+
for col, ixs in zip(dummy_cols, sp_indices):
1103+
sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs),
1104+
fill_value=0)
1105+
sparse_series[col] = SparseSeries(data=sarr, index=index)
1106+
1107+
return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
1108+
1109+
else:
1110+
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
1111+
1112+
if not dummy_na:
1113+
# reset NaN GH4446
1114+
dummy_mat[codes == -1] = 0
1115+
1116+
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
10881117

10891118

10901119
def make_axis_dummies(frame, axis='minor', transform=None):

pandas/tests/test_reshape.py

+59-25
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,8 @@ def test_multiindex(self):
151151

152152
class TestGetDummies(tm.TestCase):
153153

154+
sparse = False
155+
154156
def setUp(self):
155157
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
156158
'C': [1, 2, 3]})
@@ -163,20 +165,20 @@ def test_basic(self):
163165
expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
164166
'b': {0: 0.0, 1: 1.0, 2: 0.0},
165167
'c': {0: 0.0, 1: 0.0, 2: 1.0}})
166-
assert_frame_equal(get_dummies(s_list), expected)
167-
assert_frame_equal(get_dummies(s_series), expected)
168+
assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected)
169+
assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected)
168170

169171
expected.index = list('ABC')
170-
assert_frame_equal(get_dummies(s_series_index), expected)
172+
assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected)
171173

172174
def test_just_na(self):
173175
just_na_list = [np.nan]
174176
just_na_series = Series(just_na_list)
175177
just_na_series_index = Series(just_na_list, index = ['A'])
176178

177-
res_list = get_dummies(just_na_list)
178-
res_series = get_dummies(just_na_series)
179-
res_series_index = get_dummies(just_na_series_index)
179+
res_list = get_dummies(just_na_list, sparse=self.sparse)
180+
res_series = get_dummies(just_na_series, sparse=self.sparse)
181+
res_series_index = get_dummies(just_na_series_index, sparse=self.sparse)
180182

181183
self.assertEqual(res_list.empty, True)
182184
self.assertEqual(res_series.empty, True)
@@ -188,20 +190,21 @@ def test_just_na(self):
188190

189191
def test_include_na(self):
190192
s = ['a', 'b', np.nan]
191-
res = get_dummies(s)
193+
res = get_dummies(s, sparse=self.sparse)
192194
exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
193195
'b': {0: 0.0, 1: 1.0, 2: 0.0}})
194196
assert_frame_equal(res, exp)
195197

196-
res_na = get_dummies(s, dummy_na=True)
198+
# Sparse dataframes do not allow nan labelled columns, see #GH8822
199+
res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
197200
exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
198201
'a': {0: 1.0, 1: 0.0, 2: 0.0},
199202
'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1)
200203
# hack (NaN handling in assert_index_equal)
201204
exp_na.columns = res_na.columns
202205
assert_frame_equal(res_na, exp_na)
203206

204-
res_just_na = get_dummies([nan], dummy_na=True)
207+
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
205208
exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan])
206209
assert_array_equal(res_just_na.values, exp_just_na.values)
207210

@@ -210,21 +213,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
210213
e = 'e'
211214
eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
212215
s = [e, eacute, eacute]
213-
res = get_dummies(s, prefix='letter')
216+
res = get_dummies(s, prefix='letter', sparse=self.sparse)
214217
exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0},
215218
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
216219
assert_frame_equal(res, exp)
217220

218221
def test_dataframe_dummies_all_obj(self):
219222
df = self.df[['A', 'B']]
220-
result = get_dummies(df)
223+
result = get_dummies(df, sparse=self.sparse)
221224
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
222225
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
223226
assert_frame_equal(result, expected)
224227

225228
def test_dataframe_dummies_mix_default(self):
226229
df = self.df
227-
result = get_dummies(df)
230+
result = get_dummies(df, sparse=self.sparse)
228231
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
229232
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
230233
'B_c': [0., 0, 1]})
@@ -235,18 +238,18 @@ def test_dataframe_dummies_prefix_list(self):
235238
prefixes = ['from_A', 'from_B']
236239
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
237240
'C': [1, 2, 3]})
238-
result = get_dummies(df, prefix=prefixes)
241+
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
239242
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
240243
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
241244
'from_B_c': [0., 0, 1]})
242245
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
243246
'from_B_c']]
244247
assert_frame_equal(result, expected)
245248

246-
def test_datafrmae_dummies_prefix_str(self):
249+
def test_dataframe_dummies_prefix_str(self):
247250
# not that you should do this...
248251
df = self.df
249-
result = get_dummies(df, prefix='bad')
252+
result = get_dummies(df, prefix='bad', sparse=self.sparse)
250253
expected = DataFrame([[1, 1., 0., 1., 0.],
251254
[2, 0., 1., 1., 0.],
252255
[3, 1., 0., 0., 1.]],
@@ -256,40 +259,40 @@ def test_datafrmae_dummies_prefix_str(self):
256259
def test_dataframe_dummies_subset(self):
257260
df = self.df
258261
result = get_dummies(df, prefix=['from_A'],
259-
columns=['A'])
262+
columns=['A'], sparse=self.sparse)
260263
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
261264
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
262265
assert_frame_equal(result, expected)
263266

264267
def test_dataframe_dummies_prefix_sep(self):
265268
df = self.df
266-
result = get_dummies(df, prefix_sep='..')
269+
result = get_dummies(df, prefix_sep='..', sparse=self.sparse)
267270
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
268271
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
269272
'B..c': [0., 0, 1]})
270273
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
271274
assert_frame_equal(result, expected)
272275

273-
result = get_dummies(df, prefix_sep=['..', '__'])
276+
result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse)
274277
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
275278
assert_frame_equal(result, expected)
276279

277-
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
280+
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse)
278281
assert_frame_equal(result, expected)
279282

280283
def test_dataframe_dummies_prefix_bad_length(self):
281284
with tm.assertRaises(ValueError):
282-
get_dummies(self.df, prefix=['too few'])
285+
get_dummies(self.df, prefix=['too few'], sparse=self.sparse)
283286

284287
def test_dataframe_dummies_prefix_sep_bad_length(self):
285288
with tm.assertRaises(ValueError):
286-
get_dummies(self.df, prefix_sep=['bad'])
289+
get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse)
287290

288291
def test_dataframe_dummies_prefix_dict(self):
289292
prefixes = {'A': 'from_A', 'B': 'from_B'}
290293
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
291294
'C': [1, 2, 3]})
292-
result = get_dummies(df, prefix=prefixes)
295+
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
293296
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
294297
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
295298
'C': [1, 2, 3]})
@@ -298,22 +301,22 @@ def test_dataframe_dummies_prefix_dict(self):
298301
def test_dataframe_dummies_with_na(self):
299302
df = self.df
300303
df.loc[3, :] = [np.nan, np.nan, np.nan]
301-
result = get_dummies(df, dummy_na=True)
304+
result = get_dummies(df, dummy_na=True, sparse=self.sparse)
302305
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
303306
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
304307
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
305308
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
306309
'B_nan']]
307310
assert_frame_equal(result, expected)
308311

309-
result = get_dummies(df, dummy_na=False)
312+
result = get_dummies(df, dummy_na=False, sparse=self.sparse)
310313
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
311314
assert_frame_equal(result, expected)
312315

313316
def test_dataframe_dummies_with_categorical(self):
314317
df = self.df
315318
df['cat'] = pd.Categorical(['x', 'y', 'y'])
316-
result = get_dummies(df)
319+
result = get_dummies(df, sparse=self.sparse)
317320
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
318321
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
319322
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
@@ -322,6 +325,37 @@ def test_dataframe_dummies_with_categorical(self):
322325
'cat_x', 'cat_y']]
323326
assert_frame_equal(result, expected)
324327

328+
329+
class TestGetDummiesSparse(TestGetDummies):
330+
sparse = True
331+
332+
class TestConvertDummies(tm.TestCase):
333+
def test_convert_dummies(self):
334+
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
335+
'foo', 'bar', 'foo', 'foo'],
336+
'B': ['one', 'one', 'two', 'three',
337+
'two', 'two', 'one', 'three'],
338+
'C': np.random.randn(8),
339+
'D': np.random.randn(8)})
340+
341+
with tm.assert_produces_warning(FutureWarning):
342+
result = convert_dummies(df, ['A', 'B'])
343+
result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.')
344+
345+
expected = DataFrame({'A_foo': [1, 0, 1, 0, 1, 0, 1, 1],
346+
'A_bar': [0, 1, 0, 1, 0, 1, 0, 0],
347+
'B_one': [1, 1, 0, 0, 0, 0, 1, 0],
348+
'B_two': [0, 0, 1, 0, 1, 1, 0, 0],
349+
'B_three': [0, 0, 0, 1, 0, 0, 0, 1],
350+
'C': df['C'].values,
351+
'D': df['D'].values},
352+
columns=result.columns, dtype=float)
353+
expected2 = expected.rename(columns=lambda x: x.replace('_', '.'))
354+
355+
tm.assert_frame_equal(result, expected)
356+
tm.assert_frame_equal(result2, expected2)
357+
358+
325359
class TestLreshape(tm.TestCase):
326360

327361
def test_pairs(self):

0 commit comments

Comments
 (0)