ENH: DataFrame.unstack and Series.unstack now take fill_value kw for filling NaN when unstack results in a sparse DataFrame

amcpherson · jreback · commit de460567ecde · 2016-01-30T14:27:51.000-05:00
closes #9746 closes #10246
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -228,6 +228,27 @@ which level in the columns to stack:
    df2.stack('exp')
    df2.stack('animal')
 
+Unstacking can result in missing values if subgroups do not have the same
+set of labels.  By default, missing values will be replaced with the default
+fill value for that data type, ``NaN`` for float, ``NaT`` for datetimelike,
+etc.  For integer types, by default data will converted to float and missing
+values will be set to ``NaN``.
+
+.. ipython:: python
+
+   df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
+   df3
+   df3.unstack()
+
+.. versionadded: 0.18.0
+
+Alternatively, unstack takes an optional ``fill_value`` argument, for specifying
+the value of missing data.
+
+.. ipython:: python
+
+   df3.unstack(fill_value=-1e9)
+
 With a MultiIndex
 ~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -431,6 +431,8 @@ Other API Changes
 
 - ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
 
+- ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` will preserve the data type of the original stacked data.  (:issue:`9746`)
+
 .. _whatsnew_0180.deprecations:
 
 Deprecations
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -1127,6 +1127,12 @@ def _maybe_promote(dtype, fill_value=np.nan):
                     # the proper thing to do here would probably be to upcast
                     # to object (but numpy 1.6.1 doesn't do this properly)
                     fill_value = tslib.iNaT
+            elif issubclass(dtype.type, np.timedelta64):
+                try:
+                    fill_value = lib.Timedelta(fill_value).value
+                except:
+                    # as for datetimes, cannot upcast to object
+                    fill_value = tslib.iNaT
             else:
                 fill_value = tslib.iNaT
     elif is_datetimetz(dtype):
@@ -1153,6 +1159,16 @@ def _maybe_promote(dtype, fill_value=np.nan):
             dtype = np.object_
         elif issubclass(dtype.type, (np.integer, np.floating)):
             dtype = np.complex128
+    elif fill_value is None:
+        if is_float_dtype(dtype) or is_complex_dtype(dtype):
+            fill_value = np.nan
+        elif is_integer_dtype(dtype):
+            dtype = np.float64
+            fill_value = np.nan
+        elif is_datetime_or_timedelta_dtype(dtype):
+            fill_value = tslib.iNaT
+        else:
+            dtype = np.object_
     else:
         dtype = np.object_
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3851,7 +3851,7 @@ def stack(self, level=-1, dropna=True):
         else:
             return stack(self, level, dropna=dropna)
 
-    def unstack(self, level=-1):
+    def unstack(self, level=-1, fill_value=None):
         """
         Pivot a level of the (necessarily hierarchical) index labels, returning
         a DataFrame having a new level of column labels whose inner-most level
@@ -3864,6 +3864,10 @@ def unstack(self, level=-1):
         ----------
         level : int, string, or list of these, default -1 (last level)
             Level(s) of index to unstack, can pass level name
+        fill_value : replace NaN with this value if the unstack produces
+            missing values
+
+            .. versionadded: 0.18.0
 
         See also
         --------
@@ -3905,7 +3909,7 @@ def unstack(self, level=-1):
         unstacked : DataFrame or Series
         """
         from pandas.core.reshape import unstack
-        return unstack(self, level)
+        return unstack(self, level, fill_value)
 
     # ----------------------------------------------------------------------
     # Time series-related
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -60,7 +60,8 @@ class _Unstacker(object):
     unstacked : DataFrame
     """
 
-    def __init__(self, values, index, level=-1, value_columns=None):
+    def __init__(self, values, index, level=-1, value_columns=None,
+                 fill_value=None):
 
         self.is_categorical = None
         if values.ndim == 1:
@@ -70,6 +71,7 @@ def __init__(self, values, index, level=-1, value_columns=None):
             values = values[:, np.newaxis]
         self.values = values
         self.value_columns = value_columns
+        self.fill_value = fill_value
 
         if value_columns is None and values.shape[1] != 1:  # pragma: no cover
             raise ValueError('must pass column labels for multi-column data')
@@ -178,7 +180,7 @@ def get_new_values(self):
             dtype = values.dtype
             new_values = np.empty(result_shape, dtype=dtype)
         else:
-            dtype, fill_value = _maybe_promote(values.dtype)
+            dtype, fill_value = _maybe_promote(values.dtype, self.fill_value)
             new_values = np.empty(result_shape, dtype=dtype)
             new_values.fill(fill_value)
 
@@ -389,21 +391,22 @@ def _slow_pivot(index, columns, values):
     return DataFrame(tree)
 
 
-def unstack(obj, level):
+def unstack(obj, level, fill_value=None):
     if isinstance(level, (tuple, list)):
         return _unstack_multiple(obj, level)
 
     if isinstance(obj, DataFrame):
         if isinstance(obj.index, MultiIndex):
-            return _unstack_frame(obj, level)
+            return _unstack_frame(obj, level, fill_value=fill_value)
         else:
             return obj.T.stack(dropna=False)
     else:
-        unstacker = _Unstacker(obj.values, obj.index, level=level)
+        unstacker = _Unstacker(obj.values, obj.index, level=level,
+                               fill_value=fill_value)
         return unstacker.get_result()
 
 
-def _unstack_frame(obj, level):
+def _unstack_frame(obj, level, fill_value=None):
     from pandas.core.internals import BlockManager, make_block
 
     if obj._is_mixed_type:
@@ -419,7 +422,8 @@ def _unstack_frame(obj, level):
         for blk in obj._data.blocks:
             blk_items = obj._data.items[blk.mgr_locs.indexer]
             bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
-                                    value_columns=blk_items)
+                                    value_columns=blk_items,
+                                    fill_value=fill_value)
             new_items = bunstacker.get_new_columns()
             new_placement = new_columns.get_indexer(new_items)
             new_values, mask = bunstacker.get_new_values()
@@ -435,7 +439,8 @@ def _unstack_frame(obj, level):
         return result.ix[:, mask_frame.sum(0) > 0]
     else:
         unstacker = _Unstacker(obj.values, obj.index, level=level,
-                               value_columns=obj.columns)
+                               value_columns=obj.columns,
+                               fill_value=fill_value)
         return unstacker.get_result()
 
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -2003,7 +2003,7 @@ def reorder_levels(self, order):
         result.index = result.index.reorder_levels(order)
         return result
 
-    def unstack(self, level=-1):
+    def unstack(self, level=-1, fill_value=None):
         """
         Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
         The level involved will automatically get sorted.
@@ -2012,6 +2012,10 @@ def unstack(self, level=-1):
         ----------
         level : int, string, or list of these, default last level
             Level(s) to unstack, can pass level name
+        fill_value : replace NaN with this value if the unstack produces
+            missing values
+
+            .. versionadded: 0.18.0
 
         Examples
         --------
@@ -2036,7 +2040,7 @@ def unstack(self, level=-1):
         unstacked : DataFrame
         """
         from pandas.core.reshape import unstack
-        return unstack(self, level)
+        return unstack(self, level, fill_value)
 
     # ----------------------------------------------------------------------
     # function application
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
@@ -10,7 +10,7 @@
 import numpy as np
 
 from pandas.compat import u
-from pandas import DataFrame, Index, Series, MultiIndex, date_range
+from pandas import DataFrame, Index, Series, MultiIndex, date_range, Timedelta, Period
 import pandas as pd
 
 from pandas.util.testing import (assert_series_equal,
@@ -136,6 +136,141 @@ def test_stack_unstack(self):
         assert_frame_equal(unstacked_cols.T, self.frame)
         assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)
 
+    def test_unstack_fill(self):
+
+        # GH #9746: fill_value keyword argument for Series
+        # and DataFrame unstack
+
+        # From a series
+        data = Series([1, 2, 4, 5], dtype=np.int16)
+        data.index = MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        result = data.unstack(fill_value=-1)
+        expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
+                             index=['x', 'y', 'z'], dtype=np.int16)
+        assert_frame_equal(result, expected)
+
+        # From a series with incorrect data type for fill_value
+        result = data.unstack(fill_value=0.5)
+        expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
+                             index=['x', 'y', 'z'], dtype=np.float)
+        assert_frame_equal(result, expected)
+
+        # From a dataframe
+        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
+        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
+        df.index = MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        result = df.unstack(fill_value=-1)
+
+        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
+        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
+        expected.columns = MultiIndex.from_tuples(
+            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
+        assert_frame_equal(result, expected)
+
+        # From a mixed type dataframe
+        df['A'] = df['A'].astype(np.int16)
+        df['B'] = df['B'].astype(np.float64)
+
+        result = df.unstack(fill_value=-1)
+        expected['A'] = expected['A'].astype(np.int16)
+        expected['B'] = expected['B'].astype(np.float64)
+        assert_frame_equal(result, expected)
+
+        # From a dataframe with incorrect data type for fill_value
+        result = df.unstack(fill_value=0.5)
+
+        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
+        expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
+        expected.columns = MultiIndex.from_tuples(
+            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
+        assert_frame_equal(result, expected)
+
+        # Test unstacking with date times
+        dv = pd.date_range('2012-01-01', periods=4).values
+        data = Series(dv)
+        data.index = MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        result = data.unstack()
+        expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
+                              'b': [dv[1], dv[2], pd.NaT]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=dv[0])
+        expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
+                              'b': [dv[1], dv[2], dv[0]]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        # Test unstacking with time deltas
+        td = [Timedelta(days=i) for i in range(4)]
+        data = Series(td)
+        data.index = MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        result = data.unstack()
+        expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
+                              'b': [td[1], td[2], pd.NaT]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=td[1])
+        expected = DataFrame({'a': [td[0], td[1], td[3]],
+                              'b': [td[1], td[2], td[1]]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        # Test unstacking with period
+        periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
+                   Period('2012-04')]
+        data = Series(periods)
+        data.index = MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        result = data.unstack()
+        expected = DataFrame({'a': [periods[0], None, periods[3]],
+                              'b': [periods[1], periods[2], None]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=periods[1])
+        expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
+                              'b': [periods[1], periods[2], periods[1]]},
+                             index=['x', 'y', 'z'])
+        assert_frame_equal(result, expected)
+
+        # Test unstacking with categorical
+        data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
+        data.index = pd.MultiIndex.from_tuples(
+            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
+
+        # By default missing values will be NaN
+        result = data.unstack()
+        expected = DataFrame({'a': pd.Categorical(list('axa'),
+                                                  categories=list('abc')),
+                              'b': pd.Categorical(list('bcx'),
+                                                  categories=list('abc'))},
+                             index=list('xyz'))
+        assert_frame_equal(result, expected)
+
+        # Fill with non-category results in NaN entries similar to above
+        result = data.unstack(fill_value='d')
+        assert_frame_equal(result, expected)
+
+        # Fill with category value replaces missing values as expected
+        result = data.unstack(fill_value='c')
+        expected = DataFrame({'a': pd.Categorical(list('aca'),
+                                                  categories=list('abc')),
+                              'b': pd.Categorical(list('bcc'),
+                                                  categories=list('abc'))},
+                             index=list('xyz'))
+        assert_frame_equal(result, expected)
+
     def test_stack_ints(self):
         df = DataFrame(
             np.random.randn(30, 27),