Skip to content

Commit cd3af7e

Browse files
TomAugspurger authored and
Pingviinituutti committed
API: Always return DataFrame from get_dummies (pandas-dev#24284)
1 parent bf67998 commit cd3af7e

File tree

3 files changed

+85
-51
lines changed

3 files changed

+85
-51
lines changed

doc/source/whatsnew/v0.24.0.rst

+45
Original file line numberDiff line numberDiff line change
@@ -707,6 +707,51 @@ Finally, a ``Series.sparse`` accessor was added to provide sparse-specific metho
707707
s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
708708
s.sparse.density
709709
710+
.. _whatsnew_0240.api_breaking.get_dummies:
711+
712+
:meth:`get_dummies` always returns a DataFrame
713+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
714+
715+
Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return value could be either
716+
a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or just a subset
717+
of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned (:issue:`24284`).
718+
719+
*Previous Behavior*
720+
721+
The first :func:`get_dummies` call returns a :class:`DataFrame` because the column ``A``
722+
is not dummy encoded. When just ``["B", "C"]`` are passed to ``get_dummies``,
723+
then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was returned.
724+
725+
.. code-block:: ipython
726+
727+
In [2]: df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})
728+
729+
In [3]: type(pd.get_dummies(df, sparse=True))
730+
Out[3]: pandas.core.frame.DataFrame
731+
732+
In [4]: type(pd.get_dummies(df[['B', 'C']], sparse=True))
733+
Out[4]: pandas.core.sparse.frame.SparseDataFrame
734+
735+
.. ipython:: python
736+
:suppress:
737+
738+
df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})
739+
740+
*New Behavior*
741+
742+
Now, the return type is consistently a :class:`DataFrame`.
743+
744+
.. ipython:: python
745+
746+
type(pd.get_dummies(df, sparse=True))
747+
type(pd.get_dummies(df[['B', 'C']], sparse=True))
748+
749+
.. note::
750+
751+
There's no difference in memory usage between a :class:`SparseDataFrame`
752+
and a :class:`DataFrame` with sparse values. The memory usage will
753+
be the same as in the previous version of pandas.
754+
710755
.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:
711756

712757
Raise ValueError in ``DataFrame.to_dict(orient='index')``

pandas/core/reshape/reshape.py

+10-16
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from pandas.core.sorting import (
2626
compress_group_index, decons_obs_group_ids, get_compressed_ids,
2727
get_group_index)
28-
from pandas.core.sparse.api import SparseDataFrame, SparseSeries
2928

3029

3130
class _Unstacker(object):
@@ -706,9 +705,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
706705
If `columns` is None then all the columns with
707706
`object` or `category` dtype will be converted.
708707
sparse : bool, default False
709-
Whether the dummy columns should be sparse or not. Returns
710-
SparseDataFrame if `data` is a Series or if all columns are included.
711-
Otherwise returns a DataFrame with some SparseBlocks.
708+
Whether the dummy-encoded columns should be backed by
709+
a :class:`SparseArray` (True) or a regular NumPy array (False).
712710
drop_first : bool, default False
713711
Whether to get k-1 dummies out of k categorical levels by removing the
714712
first level.
@@ -722,7 +720,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
722720
723721
Returns
724722
-------
725-
dummies : DataFrame or SparseDataFrame
723+
dummies : DataFrame
726724
727725
See Also
728726
--------
@@ -865,19 +863,16 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
865863
if is_object_dtype(dtype):
866864
raise ValueError("dtype=object is not a valid dtype for get_dummies")
867865

868-
def get_empty_Frame(data, sparse):
866+
def get_empty_frame(data):
869867
if isinstance(data, Series):
870868
index = data.index
871869
else:
872870
index = np.arange(len(data))
873-
if not sparse:
874-
return DataFrame(index=index)
875-
else:
876-
return SparseDataFrame(index=index, default_fill_value=0)
871+
return DataFrame(index=index)
877872

878873
# if all NaN
879874
if not dummy_na and len(levels) == 0:
880-
return get_empty_Frame(data, sparse)
875+
return get_empty_frame(data)
881876

882877
codes = codes.copy()
883878
if dummy_na:
@@ -886,7 +881,7 @@ def get_empty_Frame(data, sparse):
886881

887882
# if dummy_na, we just fake a nan level. drop_first will drop it again
888883
if drop_first and len(levels) == 1:
889-
return get_empty_Frame(data, sparse)
884+
return get_empty_frame(data)
890885

891886
number_of_cols = len(levels)
892887

@@ -933,11 +928,10 @@ def _make_col_name(prefix, prefix_sep, level):
933928
sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
934929
sparse_index=IntIndex(N, ixs), fill_value=0,
935930
dtype=dtype)
936-
sparse_series[col] = SparseSeries(data=sarr, index=index)
931+
sparse_series[col] = Series(data=sarr, index=index)
937932

938-
out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
939-
default_fill_value=0,
940-
dtype=dtype)
933+
out = DataFrame(sparse_series, index=index, columns=dummy_cols,
934+
dtype=dtype)
941935
return out
942936

943937
else:

pandas/tests/reshape/test_reshape.py

+30-35
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections import OrderedDict
66

77
from pandas import DataFrame, Series
8+
from pandas.core.dtypes.common import is_integer_dtype
89
from pandas.core.sparse.api import SparseDtype, SparseArray
910
import pandas as pd
1011

@@ -54,23 +55,16 @@ def test_basic(self, sparse, dtype):
5455
'b': [0, 1, 0],
5556
'c': [0, 0, 1]},
5657
dtype=self.effective_dtype(dtype))
57-
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
5858
if sparse:
59-
tm.assert_sp_frame_equal(result,
60-
expected.to_sparse(kind='integer',
61-
fill_value=0))
62-
else:
63-
assert_frame_equal(result, expected)
59+
expected = expected.apply(pd.SparseArray, fill_value=0.0)
60+
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
61+
assert_frame_equal(result, expected)
6462

6563
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
66-
if sparse:
67-
expected = expected.to_sparse(kind='integer', fill_value=0)
6864
assert_frame_equal(result, expected)
6965

7066
expected.index = list('ABC')
7167
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
72-
if sparse:
73-
expected.to_sparse(kind='integer', fill_value=0)
7468
assert_frame_equal(result, expected)
7569

7670
def test_basic_types(self, sparse, dtype):
@@ -86,23 +80,27 @@ def test_basic_types(self, sparse, dtype):
8680
'c': [0, 0, 1]},
8781
dtype=self.effective_dtype(dtype),
8882
columns=list('abc'))
89-
if not sparse:
90-
compare = tm.assert_frame_equal
91-
else:
92-
expected = expected.to_sparse(fill_value=0, kind='integer')
93-
compare = tm.assert_sp_frame_equal
94-
83+
if sparse:
84+
if is_integer_dtype(dtype):
85+
fill_value = 0
86+
elif dtype == bool:
87+
fill_value = False
88+
else:
89+
fill_value = 0.0
90+
91+
expected = expected.apply(SparseArray, fill_value=fill_value)
9592
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
96-
compare(result, expected)
93+
tm.assert_frame_equal(result, expected)
9794

9895
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
99-
compare(result, expected)
96+
tm.assert_frame_equal(result, expected)
10097

10198
result = get_dummies(s_df, columns=s_df.columns,
10299
sparse=sparse, dtype=dtype)
103100
if sparse:
104-
dtype_name = 'Sparse[{}, 0]'.format(
105-
self.effective_dtype(dtype).name
101+
dtype_name = 'Sparse[{}, {}]'.format(
102+
self.effective_dtype(dtype).name,
103+
fill_value
106104
)
107105
else:
108106
dtype_name = self.effective_dtype(dtype).name
@@ -137,14 +135,13 @@ def test_just_na(self, sparse):
137135
assert res_series_index.index.tolist() == ['A']
138136

139137
def test_include_na(self, sparse, dtype):
140-
if sparse:
141-
pytest.xfail(reason='nan in index is problematic (GH 16894)')
142-
143138
s = ['a', 'b', np.nan]
144139
res = get_dummies(s, sparse=sparse, dtype=dtype)
145140
exp = DataFrame({'a': [1, 0, 0],
146141
'b': [0, 1, 0]},
147142
dtype=self.effective_dtype(dtype))
143+
if sparse:
144+
exp = exp.apply(pd.SparseArray, fill_value=0.0)
148145
assert_frame_equal(res, exp)
149146

150147
# Sparse dataframes do not allow nan labelled columns, see #GH8822
@@ -156,6 +153,8 @@ def test_include_na(self, sparse, dtype):
156153
exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
157154
# hack (NaN handling in assert_index_equal)
158155
exp_na.columns = res_na.columns
156+
if sparse:
157+
exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
159158
assert_frame_equal(res_na, exp_na)
160159

161160
res_just_na = get_dummies([nan], dummy_na=True,
@@ -175,10 +174,8 @@ def test_unicode(self, sparse):
175174
u('letter_%s') % eacute: [0, 1, 1]},
176175
dtype=np.uint8)
177176
if sparse:
178-
tm.assert_sp_frame_equal(res, exp.to_sparse(fill_value=0,
179-
kind='integer'))
180-
else:
181-
assert_frame_equal(res, exp)
177+
exp = exp.apply(pd.SparseArray, fill_value=0)
178+
assert_frame_equal(res, exp)
182179

183180
def test_dataframe_dummies_all_obj(self, df, sparse):
184181
df = df[['A', 'B']]
@@ -189,16 +186,14 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
189186
'B_c': [0, 0, 1]},
190187
dtype=np.uint8)
191188
if sparse:
192-
expected = pd.SparseDataFrame({
189+
expected = pd.DataFrame({
193190
"A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
194191
"A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
195192
"B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
196193
"B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
197194
})
198195

199-
tm.assert_sp_frame_equal(result, expected)
200-
else:
201-
assert_frame_equal(result, expected)
196+
assert_frame_equal(result, expected)
202197

203198
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
204199
result = get_dummies(df, sparse=sparse, dtype=dtype)
@@ -402,7 +397,7 @@ def test_basic_drop_first(self, sparse):
402397

403398
result = get_dummies(s_list, drop_first=True, sparse=sparse)
404399
if sparse:
405-
expected = expected.to_sparse(fill_value=0, kind='integer')
400+
expected = expected.apply(pd.SparseArray, fill_value=0)
406401
assert_frame_equal(result, expected)
407402

408403
result = get_dummies(s_series, drop_first=True, sparse=sparse)
@@ -436,7 +431,7 @@ def test_basic_drop_first_NA(self, sparse):
436431
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
437432
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
438433
if sparse:
439-
exp = exp.to_sparse(fill_value=0, kind='integer')
434+
exp = exp.apply(pd.SparseArray, fill_value=0)
440435

441436
assert_frame_equal(res, exp)
442437

@@ -447,7 +442,7 @@ def test_basic_drop_first_NA(self, sparse):
447442
nan: [0, 0, 1]},
448443
dtype=np.uint8).reindex(['b', nan], axis=1)
449444
if sparse:
450-
exp_na = exp_na.to_sparse(fill_value=0, kind='integer')
445+
exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
451446
assert_frame_equal(res_na, exp_na)
452447

453448
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
@@ -462,7 +457,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
462457
'B_c': [0, 0, 1]},
463458
dtype=np.uint8)
464459
if sparse:
465-
expected = expected.to_sparse(fill_value=0, kind='integer')
460+
expected = expected.apply(pd.SparseArray, fill_value=0)
466461
assert_frame_equal(result, expected)
467462

468463
def test_dataframe_dummies_drop_first_with_categorical(

0 commit comments

Comments
 (0)