Skip to content

Commit ec58429

Browse files
author
Artemy Kolchinsky
committed
BUG: get_dummies not returning SparseDataFrame
Tests redo
1 parent a3cca39 commit ec58429

File tree

3 files changed

+41
-12
lines changed

3 files changed

+41
-12
lines changed

doc/source/whatsnew/v0.17.0.txt

+4-8
Original file line numberDiff line numberDiff line change
@@ -377,16 +377,12 @@ Bug Fixes
377377
- Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`)
378378

379379

380-
381-
382-
383-
384-
385-
386-
387-
388380
- Bug in operator equal on Index not being consistent with Series (:issue:`9947`)
389381

390382
- Reading "famafrench" data via ``DataReader`` results in an HTTP 404 error because the website URL has changed (:issue:`10591`).
391383

392384
- Bug in `read_msgpack` where DataFrame to decode has duplicate column names (:issue:`9618`)
385+
386+
387+
- Bug in `get_dummies` with `sparse=True` not returning SparseDataFrame (:issue:`10531`)
388+

pandas/core/reshape.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -957,13 +957,15 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
957957
If `columns` is None then all the columns with
958958
`object` or `category` dtype will be converted.
959959
sparse : bool, default False
960-
Whether the returned DataFrame should be sparse or not.
960+
Whether the dummy columns should be sparse or not. Returns
961+
SparseDataFrame if `data` is a Series or if all columns are included.
962+
Otherwise returns a DataFrame with some SparseBlocks.
961963
962964
.. versionadded:: 0.16.1
963965
964966
Returns
965967
-------
966-
dummies : DataFrame
968+
dummies : DataFrame or SparseDataFrame
967969
968970
Examples
969971
--------
@@ -1042,8 +1044,11 @@ def check_len(item, name):
10421044
elif isinstance(prefix_sep, dict):
10431045
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
10441046

1045-
result = data.drop(columns_to_encode, axis=1)
1046-
with_dummies = [result]
1047+
if set(columns_to_encode) == set(data.columns):
1048+
with_dummies = []
1049+
else:
1050+
with_dummies = [data.drop(columns_to_encode, axis=1)]
1051+
10471052
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
10481053

10491054
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,

pandas/tests/test_reshape.py

+28
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import nose
99

1010
from pandas import DataFrame, Series
11+
from pandas.core.sparse import SparseDataFrame
1112
import pandas as pd
1213

1314
from numpy import nan
@@ -171,6 +172,33 @@ def test_basic(self):
171172
expected.index = list('ABC')
172173
assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected)
173174

175+
def test_basic_types(self):
176+
# GH 10531
177+
s_list = list('abc')
178+
s_series = Series(s_list)
179+
s_df = DataFrame({'a': [0, 1, 0, 1, 2],
180+
'b': ['A', 'A', 'B', 'C', 'C'],
181+
'c': [2, 3, 3, 3, 2]})
182+
183+
if not self.sparse:
184+
exp_df_type = DataFrame
185+
exp_blk_type = pd.core.internals.FloatBlock
186+
else:
187+
exp_df_type = SparseDataFrame
188+
exp_blk_type = pd.core.internals.SparseBlock
189+
190+
self.assertEqual(type(get_dummies(s_list, sparse=self.sparse)), exp_df_type)
191+
self.assertEqual(type(get_dummies(s_series, sparse=self.sparse)), exp_df_type)
192+
193+
r = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns)
194+
self.assertEqual(type(r), exp_df_type)
195+
196+
r = get_dummies(s_df, sparse=self.sparse, columns=['a'])
197+
self.assertEqual(type(r[['a_0']]._data.blocks[0]), exp_blk_type)
198+
self.assertEqual(type(r[['a_1']]._data.blocks[0]), exp_blk_type)
199+
self.assertEqual(type(r[['a_2']]._data.blocks[0]), exp_blk_type)
200+
201+
174202
def test_just_na(self):
175203
just_na_list = [np.nan]
176204
just_na_series = Series(just_na_list)

0 commit comments

Comments
 (0)