Skip to content

Commit 81bd50e

Browse files
author
Artemy Kolchinsky
committed
ENH: Allow get_dummies to return sparse dataframe
ENH: Allow get_dummies to return sparse dataframe (squashed with follow-up fixes)
1 parent b56fefe commit 81bd50e

File tree

3 files changed

+336
-254
lines changed

3 files changed

+336
-254
lines changed

pandas/core/reshape.py

+42-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
from pandas.core.series import Series
1010
from pandas.core.frame import DataFrame
1111

12+
from pandas.core.sparse import SparseDataFrame, SparseSeries
13+
from pandas._sparse import IntIndex
14+
1215
from pandas.core.categorical import Categorical
1316
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
1417
isnull)
@@ -1005,7 +1008,7 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
10051008

10061009

10071010
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1008-
columns=None):
1011+
columns=None, sparse=False):
10091012
"""
10101013
Convert categorical variable into dummy/indicator variables
10111014
@@ -1026,6 +1029,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
10261029
Column names in the DataFrame to be encoded.
10271030
If `columns` is None then all the columns with
10281031
`object` or `category` dtype will be converted.
1032+
sparse : bool, default False
1033+
Whether the returned DataFrame should be sparse or not.
10291034
10301035
Returns
10311036
-------
@@ -1112,16 +1117,17 @@ def check_len(item, name):
11121117
with_dummies = [result]
11131118
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
11141119

1115-
dummy = _get_dummies_1d(data[col], prefix=pre,
1116-
prefix_sep=sep, dummy_na=dummy_na)
1120+
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
1121+
dummy_na=dummy_na, sparse=sparse)
11171122
with_dummies.append(dummy)
11181123
result = concat(with_dummies, axis=1)
11191124
else:
1120-
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1125+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
1126+
sparse=sparse)
11211127
return result
11221128

11231129

1124-
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
1130+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
11251131
# Series avoids inconsistent NaN handling
11261132
cat = Categorical.from_array(Series(data))
11271133
levels = cat.categories
@@ -1132,19 +1138,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
11321138
index = data.index
11331139
else:
11341140
index = np.arange(len(data))
1135-
return DataFrame(index=index)
1136-
1137-
number_of_cols = len(levels)
1138-
if dummy_na:
1139-
number_of_cols += 1
1140-
1141-
dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)
1141+
if not sparse:
1142+
return DataFrame(index=index)
1143+
else:
1144+
return SparseDataFrame(index=index)
11421145

1146+
codes = cat.codes.copy()
11431147
if dummy_na:
1148+
codes[codes == -1] = len(cat.categories)
11441149
levels = np.append(cat.categories, np.nan)
1145-
else:
1146-
# reset NaN GH4446
1147-
dummy_mat[cat.codes == -1] = 0
1150+
1151+
number_of_cols = len(levels)
11481152

11491153
if prefix is not None:
11501154
dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
@@ -1157,7 +1161,29 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
11571161
else:
11581162
index = None
11591163

1160-
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
1164+
if sparse:
1165+
sparse_series = {}
1166+
N = len(data)
1167+
for code, col in enumerate(dummy_cols):
1168+
if code != -1:
1169+
sp_index = np.flatnonzero(codes == code)
1170+
sp_data = np.ones(len(sp_index))
1171+
else: # Blank entries if not dummy_na and code == -1, #GH4446
1172+
sp_index, sp_data = [], []
1173+
1174+
sparse_series[col] = SparseSeries(data=np.array(sp_data),
1175+
sparse_index=IntIndex(N, sp_index), index=index, fill_value=0)
1176+
1177+
return SparseDataFrame(sparse_series, index=index, default_fill_value=0)
1178+
1179+
else:
1180+
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)
1181+
1182+
if not dummy_na:
1183+
# reset NaN GH4446
1184+
dummy_mat[codes == -1] = 0
1185+
1186+
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
11611187

11621188

11631189
def make_axis_dummies(frame, axis='minor', transform=None):

0 commit comments

Comments
 (0)