Skip to content

API: Always return DataFrame from get_dummies #24284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 15, 2018
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,51 @@ Finally, a ``Series.sparse`` accessor was added to provide sparse-specific metho
s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
s.sparse.density

.. _whatsnew_0240.api_breaking.get_dummies:

:func:`get_dummies` always returns a DataFrame
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return value could be either
a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or just a subset
of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned.

*Previous Behavior*

The first :func:`get_dummies` call returned a :class:`DataFrame` because the column ``A``
was not dummy-encoded. When just ``["B", "C"]`` were passed to ``get_dummies``,
all the columns were dummy-encoded, and a :class:`SparseDataFrame` was returned.

.. ipython:: python
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

code-block


In [2]: df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})

In [3]: type(pd.get_dummies(df, sparse=True))
Out[3]: pandas.core.frame.DataFrame

In [4]: type(pd.get_dummies(df[['B', 'C']], sparse=True))
Out[4]: pandas.core.sparse.frame.SparseDataFrame

.. ipython:: python
:suppress:

df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']})

*New Behavior*

Now, the return type is consistently a :class:`DataFrame`.

.. ipython:: python

type(pd.get_dummies(df, sparse=True))
type(pd.get_dummies(df[['B', 'C']], sparse=True))

.. note::

There's no difference in memory usage between a :class:`SparseDataFrame`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might need to update the existing docs slightly and/or change the usage in previous whatsnew notes.

and a :class:`DataFrame` with sparse values. The memory usage will
be the same as in the previous version of pandas.

.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:

Raise ValueError in ``DataFrame.to_dict(orient='index')``
Expand Down
26 changes: 10 additions & 16 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pandas.core.sorting import (
compress_group_index, decons_obs_group_ids, get_compressed_ids,
get_group_index)
from pandas.core.sparse.api import SparseDataFrame, SparseSeries


class _Unstacker(object):
Expand Down Expand Up @@ -706,9 +705,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
If `columns` is None then all the columns with
`object` or `category` dtype will be converted.
sparse : bool, default False
Whether the dummy columns should be sparse or not. Returns
SparseDataFrame if `data` is a Series or if all columns are included.
Otherwise returns a DataFrame with some SparseBlocks.
Whether the dummy-encoded columns should be backed by
a :class:`SparseArray` (True) or a regular NumPy array (False).
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
Expand All @@ -722,7 +720,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,

Returns
-------
dummies : DataFrame or SparseDataFrame
dummies : DataFrame

See Also
--------
Expand Down Expand Up @@ -865,19 +863,16 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
if is_object_dtype(dtype):
raise ValueError("dtype=object is not a valid dtype for get_dummies")

def get_empty_Frame(data, sparse):
def get_empty_Frame(data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lowercase this, maybe make a module level function.

if isinstance(data, Series):
index = data.index
else:
index = np.arange(len(data))
if not sparse:
return DataFrame(index=index)
else:
return SparseDataFrame(index=index, default_fill_value=0)
return DataFrame(index=index)

# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_Frame(data, sparse)
return get_empty_Frame(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you mean to capitalize? (or was that before)


codes = codes.copy()
if dummy_na:
Expand All @@ -886,7 +881,7 @@ def get_empty_Frame(data, sparse):

# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_Frame(data, sparse)
return get_empty_Frame(data)

number_of_cols = len(levels)

Expand Down Expand Up @@ -933,11 +928,10 @@ def _make_col_name(prefix, prefix_sep, level):
sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
sparse_index=IntIndex(N, ixs), fill_value=0,
dtype=dtype)
sparse_series[col] = SparseSeries(data=sarr, index=index)
sparse_series[col] = Series(data=sarr, index=index)

out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
default_fill_value=0,
dtype=dtype)
out = DataFrame(sparse_series, index=index, columns=dummy_cols,
dtype=dtype)
return out

else:
Expand Down
65 changes: 30 additions & 35 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from collections import OrderedDict

from pandas import DataFrame, Series
from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.sparse.api import SparseDtype, SparseArray
import pandas as pd

Expand Down Expand Up @@ -54,23 +55,16 @@ def test_basic(self, sparse, dtype):
'b': [0, 1, 0],
'c': [0, 0, 1]},
dtype=self.effective_dtype(dtype))
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
if sparse:
tm.assert_sp_frame_equal(result,
expected.to_sparse(kind='integer',
fill_value=0))
else:
assert_frame_equal(result, expected)
expected = expected.apply(pd.SparseArray, fill_value=0.0)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=sparse, dtype=dtype)
if sparse:
expected = expected.to_sparse(kind='integer', fill_value=0)
assert_frame_equal(result, expected)

expected.index = list('ABC')
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
if sparse:
expected.to_sparse(kind='integer', fill_value=0)
assert_frame_equal(result, expected)

def test_basic_types(self, sparse, dtype):
Expand All @@ -86,23 +80,27 @@ def test_basic_types(self, sparse, dtype):
'c': [0, 0, 1]},
dtype=self.effective_dtype(dtype),
columns=list('abc'))
if not sparse:
compare = tm.assert_frame_equal
else:
expected = expected.to_sparse(fill_value=0, kind='integer')
compare = tm.assert_sp_frame_equal

if sparse:
if is_integer_dtype(dtype):
fill_value = 0
elif dtype == bool:
fill_value = False
else:
fill_value = 0.0

expected = expected.apply(SparseArray, fill_value=fill_value)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
compare(result, expected)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, sparse=sparse, dtype=dtype)
compare(result, expected)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_df, columns=s_df.columns,
sparse=sparse, dtype=dtype)
if sparse:
dtype_name = 'Sparse[{}, 0]'.format(
self.effective_dtype(dtype).name
dtype_name = 'Sparse[{}, {}]'.format(
self.effective_dtype(dtype).name,
fill_value
)
else:
dtype_name = self.effective_dtype(dtype).name
Expand Down Expand Up @@ -137,14 +135,13 @@ def test_just_na(self, sparse):
assert res_series_index.index.tolist() == ['A']

def test_include_na(self, sparse, dtype):
if sparse:
pytest.xfail(reason='nan in index is problematic (GH 16894)')

s = ['a', 'b', np.nan]
res = get_dummies(s, sparse=sparse, dtype=dtype)
exp = DataFrame({'a': [1, 0, 0],
'b': [0, 1, 0]},
dtype=self.effective_dtype(dtype))
if sparse:
exp = exp.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res, exp)

# Sparse dataframes do not allow nan labelled columns, see #GH8822
Expand All @@ -156,6 +153,8 @@ def test_include_na(self, sparse, dtype):
exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
if sparse:
exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True,
Expand All @@ -175,10 +174,8 @@ def test_unicode(self, sparse):
u('letter_%s') % eacute: [0, 1, 1]},
dtype=np.uint8)
if sparse:
tm.assert_sp_frame_equal(res, exp.to_sparse(fill_value=0,
kind='integer'))
else:
assert_frame_equal(res, exp)
exp = exp.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res, exp)

def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[['A', 'B']]
Expand All @@ -189,16 +186,14 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
'B_c': [0, 0, 1]},
dtype=np.uint8)
if sparse:
expected = pd.SparseDataFrame({
expected = pd.DataFrame({
"A_a": pd.SparseArray([1, 0, 1], dtype='uint8'),
"A_b": pd.SparseArray([0, 1, 0], dtype='uint8'),
"B_b": pd.SparseArray([1, 1, 0], dtype='uint8'),
"B_c": pd.SparseArray([0, 0, 1], dtype='uint8'),
})

tm.assert_sp_frame_equal(result, expected)
else:
assert_frame_equal(result, expected)
assert_frame_equal(result, expected)

def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
Expand Down Expand Up @@ -402,7 +397,7 @@ def test_basic_drop_first(self, sparse):

result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
expected = expected.to_sparse(fill_value=0, kind='integer')
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
Expand Down Expand Up @@ -436,7 +431,7 @@ def test_basic_drop_first_NA(self, sparse):
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
if sparse:
exp = exp.to_sparse(fill_value=0, kind='integer')
exp = exp.apply(pd.SparseArray, fill_value=0)

assert_frame_equal(res, exp)

Expand All @@ -447,7 +442,7 @@ def test_basic_drop_first_NA(self, sparse):
nan: [0, 0, 1]},
dtype=np.uint8).reindex(['b', nan], axis=1)
if sparse:
exp_na = exp_na.to_sparse(fill_value=0, kind='integer')
exp_na = exp_na.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
Expand All @@ -462,7 +457,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
'B_c': [0, 0, 1]},
dtype=np.uint8)
if sparse:
expected = expected.to_sparse(fill_value=0, kind='integer')
expected = expected.apply(pd.SparseArray, fill_value=0)
assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(
Expand Down