From 7c2f9d1b7677f9dec4512bd3283c0a27646973c7 Mon Sep 17 00:00:00 2001 From: Andrew McPherson Date: Mon, 1 Jun 2015 07:45:06 -0700 Subject: [PATCH] ENH: GH9746 DataFrame.unstack and Series.unstack now take fill_value kw for filling NaN when unstack results in a sparse DataFrame --- doc/source/reshaping.rst | 21 +++++ doc/source/whatsnew/v0.18.0.txt | 4 + pandas/core/common.py | 16 ++++ pandas/core/frame.py | 8 +- pandas/core/reshape.py | 21 +++-- pandas/core/series.py | 8 +- pandas/tests/frame/test_reshape.py | 137 ++++++++++++++++++++++++++++- 7 files changed, 202 insertions(+), 13 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index dbf3b838593a9..f3990faec9e1a 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -228,6 +228,27 @@ which level in the columns to stack: df2.stack('exp') df2.stack('animal') +Unstacking can result in missing values if subgroups do not have the same +set of labels. By default, missing values will be replaced with the default +fill value for that data type, ``NaN`` for float, ``NaT`` for datetimelike, +etc. For integer types, by default data will be converted to float and missing +values will be set to ``NaN``. + +.. ipython:: python + + df3 = df.iloc[[0, 1, 4, 7], [1, 2]] + df3 + df3.unstack() + +.. versionadded:: 0.18.0 + +Alternatively, unstack takes an optional ``fill_value`` argument, for specifying +the value of missing data. + +.. 
ipython:: python + + df3.unstack(fill_value=-1e9) + With a MultiIndex ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 58b60fb08920a..a4f1f0ee2e67e 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -431,6 +431,10 @@ Other API Changes - ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`) +- ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of + missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, + specifying ``fill_value`` will preserve the data type of the original stacked data. (:issue:`9746`) + .. _whatsnew_0180.deprecations: Deprecations diff --git a/pandas/core/common.py b/pandas/core/common.py index 439913a0f5b44..508765896e275 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1127,6 +1127,12 @@ def _maybe_promote(dtype, fill_value=np.nan): # the proper thing to do here would probably be to upcast # to object (but numpy 1.6.1 doesn't do this properly) fill_value = tslib.iNaT + elif issubclass(dtype.type, np.timedelta64): + try: + fill_value = lib.Timedelta(fill_value).value + except: + # as for datetimes, cannot upcast to object + fill_value = tslib.iNaT else: fill_value = tslib.iNaT elif is_datetimetz(dtype): @@ -1153,6 +1159,16 @@ def _maybe_promote(dtype, fill_value=np.nan): dtype = np.object_ elif issubclass(dtype.type, (np.integer, np.floating)): dtype = np.complex128 + elif fill_value is None: + if is_float_dtype(dtype) or is_complex_dtype(dtype): + fill_value = np.nan + elif is_integer_dtype(dtype): + dtype = np.float64 + fill_value = np.nan + elif is_datetime_or_timedelta_dtype(dtype): + fill_value = tslib.iNaT + else: + dtype = np.object_ else: dtype = np.object_ diff --git a/pandas/core/frame.py 
b/pandas/core/frame.py index 17092b7be9f62..41a4cd0d77508 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3851,7 +3851,7 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) - def unstack(self, level=-1): + def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning a DataFrame having a new level of column labels whose inner-most level @@ -3864,6 +3864,10 @@ def unstack(self, level=-1): ---------- level : int, string, or list of these, default -1 (last level) Level(s) of index to unstack, can pass level name + fill_value : replace NaN with this value if the unstack produces + missing values + + .. versionadded:: 0.18.0 See also -------- @@ -3905,7 +3909,7 @@ def unstack(self, level=-1): unstacked : DataFrame or Series """ from pandas.core.reshape import unstack - return unstack(self, level) + return unstack(self, level, fill_value) # ---------------------------------------------------------------------- # Time series-related diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 4dffaa0b0c416..05257dd0ac625 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -60,7 +60,8 @@ class _Unstacker(object): unstacked : DataFrame """ - def __init__(self, values, index, level=-1, value_columns=None): + def __init__(self, values, index, level=-1, value_columns=None, + fill_value=None): self.is_categorical = None if values.ndim == 1: @@ -70,6 +71,7 @@ def __init__(self, values, index, level=-1, value_columns=None): values = values[:, np.newaxis] self.values = values self.value_columns = value_columns + self.fill_value = fill_value if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError('must pass column labels for multi-column data') @@ -178,7 +180,7 @@ def get_new_values(self): dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: - dtype, fill_value = _maybe_promote(values.dtype) + 
dtype, fill_value = _maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) @@ -389,21 +391,22 @@ def _slow_pivot(index, columns, values): return DataFrame(tree) -def unstack(obj, level): +def unstack(obj, level, fill_value=None): if isinstance(level, (tuple, list)): return _unstack_multiple(obj, level) if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): - return _unstack_frame(obj, level) + return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) else: - unstacker = _Unstacker(obj.values, obj.index, level=level) + unstacker = _Unstacker(obj.values, obj.index, level=level, + fill_value=fill_value) return unstacker.get_result() -def _unstack_frame(obj, level): +def _unstack_frame(obj, level, fill_value=None): from pandas.core.internals import BlockManager, make_block if obj._is_mixed_type: @@ -419,7 +422,8 @@ def _unstack_frame(obj, level): for blk in obj._data.blocks: blk_items = obj._data.items[blk.mgr_locs.indexer] bunstacker = _Unstacker(blk.values.T, obj.index, level=level, - value_columns=blk_items) + value_columns=blk_items, + fill_value=fill_value) new_items = bunstacker.get_new_columns() new_placement = new_columns.get_indexer(new_items) new_values, mask = bunstacker.get_new_values() @@ -435,7 +439,8 @@ def _unstack_frame(obj, level): return result.ix[:, mask_frame.sum(0) > 0] else: unstacker = _Unstacker(obj.values, obj.index, level=level, - value_columns=obj.columns) + value_columns=obj.columns, + fill_value=fill_value) return unstacker.get_result() diff --git a/pandas/core/series.py b/pandas/core/series.py index 699a0ca66f5f9..49182951c0e9d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2003,7 +2003,7 @@ def reorder_levels(self, order): result.index = result.index.reorder_levels(order) return result - def unstack(self, level=-1): + def unstack(self, level=-1, fill_value=None): """ Unstack, a.k.a. 
pivot, Series with MultiIndex to produce DataFrame. The level involved will automatically get sorted. @@ -2012,6 +2012,10 @@ def unstack(self, level=-1): ---------- level : int, string, or list of these, default last level Level(s) to unstack, can pass level name + fill_value : replace NaN with this value if the unstack produces + missing values + + .. versionadded:: 0.18.0 Examples -------- @@ -2036,7 +2040,7 @@ def unstack(self, level=-1): unstacked : DataFrame """ from pandas.core.reshape import unstack - return unstack(self, level) + return unstack(self, level, fill_value) # ---------------------------------------------------------------------- # function application diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index c030a6a71c7b8..c0963d885a08d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -10,7 +10,7 @@ import numpy as np from pandas.compat import u -from pandas import DataFrame, Index, Series, MultiIndex, date_range +from pandas import DataFrame, Index, Series, MultiIndex, date_range, Timedelta, Period import pandas as pd from pandas.util.testing import (assert_series_equal, @@ -136,6 +136,141 @@ def test_stack_unstack(self): assert_frame_equal(unstacked_cols.T, self.frame) assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) + def test_unstack_fill(self): + + # GH #9746: fill_value keyword argument for Series + # and DataFrame unstack + + # From a series + data = Series([1, 2, 4, 5], dtype=np.int16) + data.index = MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + result = data.unstack(fill_value=-1) + expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, + index=['x', 'y', 'z'], dtype=np.int16) + assert_frame_equal(result, expected) + + # From a series with incorrect data type for fill_value + result = data.unstack(fill_value=0.5) + expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, + index=['x', 'y', 'z'], dtype=np.float) + 
assert_frame_equal(result, expected) + + # From a dataframe + rows = [[1, 2], [3, 4], [5, 6], [7, 8]] + df = DataFrame(rows, columns=list('AB'), dtype=np.int32) + df.index = MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + result = df.unstack(fill_value=-1) + + rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] + expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) + expected.columns = MultiIndex.from_tuples( + [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + assert_frame_equal(result, expected) + + # From a mixed type dataframe + df['A'] = df['A'].astype(np.int16) + df['B'] = df['B'].astype(np.float64) + + result = df.unstack(fill_value=-1) + expected['A'] = expected['A'].astype(np.int16) + expected['B'] = expected['B'].astype(np.float64) + assert_frame_equal(result, expected) + + # From a dataframe with incorrect data type for fill_value + result = df.unstack(fill_value=0.5) + + rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] + expected = DataFrame(rows, index=list('xyz'), dtype=np.float) + expected.columns = MultiIndex.from_tuples( + [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + assert_frame_equal(result, expected) + + # Test unstacking with date times + dv = pd.date_range('2012-01-01', periods=4).values + data = Series(dv) + data.index = MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + result = data.unstack() + expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]], + 'b': [dv[1], dv[2], pd.NaT]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + result = data.unstack(fill_value=dv[0]) + expected = DataFrame({'a': [dv[0], dv[0], dv[3]], + 'b': [dv[1], dv[2], dv[0]]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + # Test unstacking with time deltas + td = [Timedelta(days=i) for i in range(4)] + data = Series(td) + data.index = MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + result = data.unstack() + 
expected = DataFrame({'a': [td[0], pd.NaT, td[3]], + 'b': [td[1], td[2], pd.NaT]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + result = data.unstack(fill_value=td[1]) + expected = DataFrame({'a': [td[0], td[1], td[3]], + 'b': [td[1], td[2], td[1]]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + # Test unstacking with period + periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), + Period('2012-04')] + data = Series(periods) + data.index = MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + result = data.unstack() + expected = DataFrame({'a': [periods[0], None, periods[3]], + 'b': [periods[1], periods[2], None]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + result = data.unstack(fill_value=periods[1]) + expected = DataFrame({'a': [periods[0], periods[1], periods[3]], + 'b': [periods[1], periods[2], periods[1]]}, + index=['x', 'y', 'z']) + assert_frame_equal(result, expected) + + # Test unstacking with categorical + data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + data.index = pd.MultiIndex.from_tuples( + [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + + # By default missing values will be NaN + result = data.unstack() + expected = DataFrame({'a': pd.Categorical(list('axa'), + categories=list('abc')), + 'b': pd.Categorical(list('bcx'), + categories=list('abc'))}, + index=list('xyz')) + assert_frame_equal(result, expected) + + # Fill with non-category results in NaN entries similar to above + result = data.unstack(fill_value='d') + assert_frame_equal(result, expected) + + # Fill with category value replaces missing values as expected + result = data.unstack(fill_value='c') + expected = DataFrame({'a': pd.Categorical(list('aca'), + categories=list('abc')), + 'b': pd.Categorical(list('bcc'), + categories=list('abc'))}, + index=list('xyz')) + assert_frame_equal(result, expected) + def test_stack_ints(self): df = DataFrame( np.random.randn(30, 27),