diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index df34c78f4408e..a6b973fb82073 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -707,6 +707,51 @@ Finally, a ``Series.sparse`` accessor was added to provide sparse-specific metho s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]') s.sparse.density +.. _whatsnew_0240.api_breaking.get_dummies: + +:meth:`get_dummies` always returns a DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when ``sparse=True`` was passed to :func:`get_dummies`, the return value could be either +a :class:`DataFrame` or a :class:`SparseDataFrame`, depending on whether all or just a subset +of the columns were dummy-encoded. Now, a :class:`DataFrame` is always returned (:issue:`24284`). + +*Previous Behavior* + +The first :func:`get_dummies` returns a :class:`DataFrame` because the column ``A`` +is not dummy encoded. When just ``["B", "C"]`` are passed to ``get_dummies``, +then all the columns are dummy-encoded, and a :class:`SparseDataFrame` was returned. + +.. code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']}) + + In [3]: type(pd.get_dummies(df, sparse=True)) + Out[3]: pandas.core.frame.DataFrame + + In [4]: type(pd.get_dummies(df[['B', 'C']], sparse=True)) + Out[4]: pandas.core.sparse.frame.SparseDataFrame + +.. ipython:: python + :suppress: + + df = pd.DataFrame({"A": [1, 2], "B": ['a', 'b'], "C": ['a', 'a']}) + +*New Behavior* + +Now, the return type is consistently a :class:`DataFrame`. + +.. ipython:: python + + type(pd.get_dummies(df, sparse=True)) + type(pd.get_dummies(df[['B', 'C']], sparse=True)) + +.. note:: + + There's no difference in memory usage between a :class:`SparseDataFrame` + and a :class:`DataFrame` with sparse values. The memory usage will + be the same as in the previous version of pandas. + .. 
_whatsnew_0240.api_breaking.frame_to_dict_index_orient: Raise ValueError in ``DataFrame.to_dict(orient='index')`` diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2bd7e2c0b9b82..8319a8cc5417c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -25,7 +25,6 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, get_compressed_ids, get_group_index) -from pandas.core.sparse.api import SparseDataFrame, SparseSeries class _Unstacker(object): @@ -706,9 +705,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False - Whether the dummy columns should be sparse or not. Returns - SparseDataFrame if `data` is a Series or if all columns are included. - Otherwise returns a DataFrame with some SparseBlocks. + Whether the dummy-encoded columns should be backed by + a :class:`SparseArray` (True) or a regular NumPy array (False). drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. 
@@ -722,7 +720,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Returns ------- - dummies : DataFrame or SparseDataFrame + dummies : DataFrame See Also -------- @@ -865,19 +863,16 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") - def get_empty_Frame(data, sparse): + def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) - if not sparse: - return DataFrame(index=index) - else: - return SparseDataFrame(index=index, default_fill_value=0) + return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: - return get_empty_Frame(data, sparse) + return get_empty_frame(data) codes = codes.copy() if dummy_na: @@ -886,7 +881,7 @@ def get_empty_Frame(data, sparse): # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: - return get_empty_Frame(data, sparse) + return get_empty_frame(data) number_of_cols = len(levels) @@ -933,11 +928,10 @@ def _make_col_name(prefix, prefix_sep, level): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) - sparse_series[col] = SparseSeries(data=sarr, index=index) + sparse_series[col] = Series(data=sarr, index=index) - out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, - default_fill_value=0, - dtype=dtype) + out = DataFrame(sparse_series, index=index, columns=dummy_cols, + dtype=dtype) return out else: diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 0d26e9c375d0d..edbe70d308b96 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -5,6 +5,7 @@ from collections import OrderedDict from pandas import DataFrame, Series +from pandas.core.dtypes.common import is_integer_dtype from pandas.core.sparse.api import SparseDtype, 
SparseArray import pandas as pd @@ -54,23 +55,16 @@ def test_basic(self, sparse, dtype): 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype=self.effective_dtype(dtype)) - result = get_dummies(s_list, sparse=sparse, dtype=dtype) if sparse: - tm.assert_sp_frame_equal(result, - expected.to_sparse(kind='integer', - fill_value=0)) - else: - assert_frame_equal(result, expected) + expected = expected.apply(pd.SparseArray, fill_value=0.0) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) + assert_frame_equal(result, expected) result = get_dummies(s_series, sparse=sparse, dtype=dtype) - if sparse: - expected = expected.to_sparse(kind='integer', fill_value=0) assert_frame_equal(result, expected) expected.index = list('ABC') result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) - if sparse: - expected.to_sparse(kind='integer', fill_value=0) assert_frame_equal(result, expected) def test_basic_types(self, sparse, dtype): @@ -86,23 +80,27 @@ def test_basic_types(self, sparse, dtype): 'c': [0, 0, 1]}, dtype=self.effective_dtype(dtype), columns=list('abc')) - if not sparse: - compare = tm.assert_frame_equal - else: - expected = expected.to_sparse(fill_value=0, kind='integer') - compare = tm.assert_sp_frame_equal - + if sparse: + if is_integer_dtype(dtype): + fill_value = 0 + elif dtype == bool: + fill_value = False + else: + fill_value = 0.0 + + expected = expected.apply(SparseArray, fill_value=fill_value) result = get_dummies(s_list, sparse=sparse, dtype=dtype) - compare(result, expected) + tm.assert_frame_equal(result, expected) result = get_dummies(s_series, sparse=sparse, dtype=dtype) - compare(result, expected) + tm.assert_frame_equal(result, expected) result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype) if sparse: - dtype_name = 'Sparse[{}, 0]'.format( - self.effective_dtype(dtype).name + dtype_name = 'Sparse[{}, {}]'.format( + self.effective_dtype(dtype).name, + fill_value ) else: dtype_name = self.effective_dtype(dtype).name @@ -137,14 
+135,13 @@ def test_just_na(self, sparse): assert res_series_index.index.tolist() == ['A'] def test_include_na(self, sparse, dtype): - if sparse: - pytest.xfail(reason='nan in index is problematic (GH 16894)') - s = ['a', 'b', np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) exp = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0]}, dtype=self.effective_dtype(dtype)) + if sparse: + exp = exp.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -156,6 +153,8 @@ def test_include_na(self, sparse, dtype): exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns + if sparse: + exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, @@ -175,10 +174,8 @@ def test_unicode(self, sparse): u('letter_%s') % eacute: [0, 1, 1]}, dtype=np.uint8) if sparse: - tm.assert_sp_frame_equal(res, exp.to_sparse(fill_value=0, - kind='integer')) - else: - assert_frame_equal(res, exp) + exp = exp.apply(pd.SparseArray, fill_value=0) + assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): df = df[['A', 'B']] @@ -189,16 +186,14 @@ def test_dataframe_dummies_all_obj(self, df, sparse): 'B_c': [0, 0, 1]}, dtype=np.uint8) if sparse: - expected = pd.SparseDataFrame({ + expected = pd.DataFrame({ "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'), "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'), "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'), "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'), }) - tm.assert_sp_frame_equal(result, expected) - else: - assert_frame_equal(result, expected) + assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self, df, sparse, dtype): result = get_dummies(df, sparse=sparse, dtype=dtype) @@ -402,7 +397,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_list, drop_first=True, 
sparse=sparse) if sparse: - expected = expected.to_sparse(fill_value=0, kind='integer') + expected = expected.apply(pd.SparseArray, fill_value=0) assert_frame_equal(result, expected) result = get_dummies(s_series, drop_first=True, sparse=sparse) @@ -436,7 +431,7 @@ def test_basic_drop_first_NA(self, sparse): res = get_dummies(s_NA, drop_first=True, sparse=sparse) exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8) if sparse: - exp = exp.to_sparse(fill_value=0, kind='integer') + exp = exp.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res, exp) @@ -447,7 +442,7 @@ def test_basic_drop_first_NA(self, sparse): nan: [0, 0, 1]}, dtype=np.uint8).reindex(['b', nan], axis=1) if sparse: - exp_na = exp_na.to_sparse(fill_value=0, kind='integer') + exp_na = exp_na.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, @@ -462,7 +457,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): 'B_c': [0, 0, 1]}, dtype=np.uint8) if sparse: - expected = expected.to_sparse(fill_value=0, kind='integer') + expected = expected.apply(pd.SparseArray, fill_value=0) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(