Skip to content

Commit de46056

Browse files
amcpherson authored and jreback committed
ENH: DataFrame.unstack and Series.unstack now take fill_value kw for filling NaN when unstack results in a sparse DataFrame
closes #9746 closes #10246
1 parent 059ffaa commit de46056

File tree

7 files changed

+200
-13
lines changed

7 files changed

+200
-13
lines changed

doc/source/reshaping.rst

+21
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,27 @@ which level in the columns to stack:
228228
df2.stack('exp')
229229
df2.stack('animal')
230230
231+
Unstacking can result in missing values if subgroups do not have the same
232+
set of labels. By default, missing values will be replaced with the default
233+
fill value for that data type, ``NaN`` for float, ``NaT`` for datetimelike,
234+
etc. For integer types, by default data will be converted to float and missing
235+
values will be set to ``NaN``.
236+
237+
.. ipython:: python
238+
239+
df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
240+
df3
241+
df3.unstack()
242+
243+
.. versionadded:: 0.18.0
244+
245+
Alternatively, unstack takes an optional ``fill_value`` argument, for specifying
246+
the value of missing data.
247+
248+
.. ipython:: python
249+
250+
df3.unstack(fill_value=-1e9)
251+
231252
With a MultiIndex
232253
~~~~~~~~~~~~~~~~~
233254

doc/source/whatsnew/v0.18.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,8 @@ Other API Changes
431431

432432
- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)
433433

434+
- ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` will preserve the data type of the original stacked data. (:issue:`9746`)
435+
434436
.. _whatsnew_0180.deprecations:
435437

436438
Deprecations

pandas/core/common.py

+16
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,12 @@ def _maybe_promote(dtype, fill_value=np.nan):
11271127
# the proper thing to do here would probably be to upcast
11281128
# to object (but numpy 1.6.1 doesn't do this properly)
11291129
fill_value = tslib.iNaT
1130+
elif issubclass(dtype.type, np.timedelta64):
1131+
try:
1132+
fill_value = lib.Timedelta(fill_value).value
1133+
except:
1134+
# as for datetimes, cannot upcast to object
1135+
fill_value = tslib.iNaT
11301136
else:
11311137
fill_value = tslib.iNaT
11321138
elif is_datetimetz(dtype):
@@ -1153,6 +1159,16 @@ def _maybe_promote(dtype, fill_value=np.nan):
11531159
dtype = np.object_
11541160
elif issubclass(dtype.type, (np.integer, np.floating)):
11551161
dtype = np.complex128
1162+
elif fill_value is None:
1163+
if is_float_dtype(dtype) or is_complex_dtype(dtype):
1164+
fill_value = np.nan
1165+
elif is_integer_dtype(dtype):
1166+
dtype = np.float64
1167+
fill_value = np.nan
1168+
elif is_datetime_or_timedelta_dtype(dtype):
1169+
fill_value = tslib.iNaT
1170+
else:
1171+
dtype = np.object_
11561172
else:
11571173
dtype = np.object_
11581174

pandas/core/frame.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -3851,7 +3851,7 @@ def stack(self, level=-1, dropna=True):
38513851
else:
38523852
return stack(self, level, dropna=dropna)
38533853

3854-
def unstack(self, level=-1):
3854+
def unstack(self, level=-1, fill_value=None):
38553855
"""
38563856
Pivot a level of the (necessarily hierarchical) index labels, returning
38573857
a DataFrame having a new level of column labels whose inner-most level
@@ -3864,6 +3864,10 @@ def unstack(self, level=-1):
38643864
----------
38653865
level : int, string, or list of these, default -1 (last level)
38663866
Level(s) of index to unstack, can pass level name
3867+
fill_value : replace NaN with this value if the unstack produces
3868+
missing values
3869+
3870+
.. versionadded:: 0.18.0
38673871
38683872
See also
38693873
--------
@@ -3905,7 +3909,7 @@ def unstack(self, level=-1):
39053909
unstacked : DataFrame or Series
39063910
"""
39073911
from pandas.core.reshape import unstack
3908-
return unstack(self, level)
3912+
return unstack(self, level, fill_value)
39093913

39103914
# ----------------------------------------------------------------------
39113915
# Time series-related

pandas/core/reshape.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ class _Unstacker(object):
6060
unstacked : DataFrame
6161
"""
6262

63-
def __init__(self, values, index, level=-1, value_columns=None):
63+
def __init__(self, values, index, level=-1, value_columns=None,
64+
fill_value=None):
6465

6566
self.is_categorical = None
6667
if values.ndim == 1:
@@ -70,6 +71,7 @@ def __init__(self, values, index, level=-1, value_columns=None):
7071
values = values[:, np.newaxis]
7172
self.values = values
7273
self.value_columns = value_columns
74+
self.fill_value = fill_value
7375

7476
if value_columns is None and values.shape[1] != 1: # pragma: no cover
7577
raise ValueError('must pass column labels for multi-column data')
@@ -178,7 +180,7 @@ def get_new_values(self):
178180
dtype = values.dtype
179181
new_values = np.empty(result_shape, dtype=dtype)
180182
else:
181-
dtype, fill_value = _maybe_promote(values.dtype)
183+
dtype, fill_value = _maybe_promote(values.dtype, self.fill_value)
182184
new_values = np.empty(result_shape, dtype=dtype)
183185
new_values.fill(fill_value)
184186

@@ -389,21 +391,22 @@ def _slow_pivot(index, columns, values):
389391
return DataFrame(tree)
390392

391393

392-
def unstack(obj, level):
394+
def unstack(obj, level, fill_value=None):
393395
if isinstance(level, (tuple, list)):
394396
return _unstack_multiple(obj, level)
395397

396398
if isinstance(obj, DataFrame):
397399
if isinstance(obj.index, MultiIndex):
398-
return _unstack_frame(obj, level)
400+
return _unstack_frame(obj, level, fill_value=fill_value)
399401
else:
400402
return obj.T.stack(dropna=False)
401403
else:
402-
unstacker = _Unstacker(obj.values, obj.index, level=level)
404+
unstacker = _Unstacker(obj.values, obj.index, level=level,
405+
fill_value=fill_value)
403406
return unstacker.get_result()
404407

405408

406-
def _unstack_frame(obj, level):
409+
def _unstack_frame(obj, level, fill_value=None):
407410
from pandas.core.internals import BlockManager, make_block
408411

409412
if obj._is_mixed_type:
@@ -419,7 +422,8 @@ def _unstack_frame(obj, level):
419422
for blk in obj._data.blocks:
420423
blk_items = obj._data.items[blk.mgr_locs.indexer]
421424
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
422-
value_columns=blk_items)
425+
value_columns=blk_items,
426+
fill_value=fill_value)
423427
new_items = bunstacker.get_new_columns()
424428
new_placement = new_columns.get_indexer(new_items)
425429
new_values, mask = bunstacker.get_new_values()
@@ -435,7 +439,8 @@ def _unstack_frame(obj, level):
435439
return result.ix[:, mask_frame.sum(0) > 0]
436440
else:
437441
unstacker = _Unstacker(obj.values, obj.index, level=level,
438-
value_columns=obj.columns)
442+
value_columns=obj.columns,
443+
fill_value=fill_value)
439444
return unstacker.get_result()
440445

441446

pandas/core/series.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -2003,7 +2003,7 @@ def reorder_levels(self, order):
20032003
result.index = result.index.reorder_levels(order)
20042004
return result
20052005

2006-
def unstack(self, level=-1):
2006+
def unstack(self, level=-1, fill_value=None):
20072007
"""
20082008
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
20092009
The level involved will automatically get sorted.
@@ -2012,6 +2012,10 @@ def unstack(self, level=-1):
20122012
----------
20132013
level : int, string, or list of these, default last level
20142014
Level(s) to unstack, can pass level name
2015+
fill_value : replace NaN with this value if the unstack produces
2016+
missing values
2017+
2018+
.. versionadded:: 0.18.0
20152019
20162020
Examples
20172021
--------
@@ -2036,7 +2040,7 @@ def unstack(self, level=-1):
20362040
unstacked : DataFrame
20372041
"""
20382042
from pandas.core.reshape import unstack
2039-
return unstack(self, level)
2043+
return unstack(self, level, fill_value)
20402044

20412045
# ----------------------------------------------------------------------
20422046
# function application

pandas/tests/frame/test_reshape.py

+136-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import numpy as np
1111

1212
from pandas.compat import u
13-
from pandas import DataFrame, Index, Series, MultiIndex, date_range
13+
from pandas import DataFrame, Index, Series, MultiIndex, date_range, Timedelta, Period
1414
import pandas as pd
1515

1616
from pandas.util.testing import (assert_series_equal,
@@ -136,6 +136,141 @@ def test_stack_unstack(self):
136136
assert_frame_equal(unstacked_cols.T, self.frame)
137137
assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)
138138

139+
def test_unstack_fill(self):
140+
141+
# GH #9746: fill_value keyword argument for Series
142+
# and DataFrame unstack
143+
144+
# From a series
145+
data = Series([1, 2, 4, 5], dtype=np.int16)
146+
data.index = MultiIndex.from_tuples(
147+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
148+
149+
result = data.unstack(fill_value=-1)
150+
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
151+
index=['x', 'y', 'z'], dtype=np.int16)
152+
assert_frame_equal(result, expected)
153+
154+
# From a series with incorrect data type for fill_value
155+
result = data.unstack(fill_value=0.5)
156+
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
157+
index=['x', 'y', 'z'], dtype=np.float)
158+
assert_frame_equal(result, expected)
159+
160+
# From a dataframe
161+
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
162+
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
163+
df.index = MultiIndex.from_tuples(
164+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
165+
166+
result = df.unstack(fill_value=-1)
167+
168+
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
169+
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
170+
expected.columns = MultiIndex.from_tuples(
171+
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
172+
assert_frame_equal(result, expected)
173+
174+
# From a mixed type dataframe
175+
df['A'] = df['A'].astype(np.int16)
176+
df['B'] = df['B'].astype(np.float64)
177+
178+
result = df.unstack(fill_value=-1)
179+
expected['A'] = expected['A'].astype(np.int16)
180+
expected['B'] = expected['B'].astype(np.float64)
181+
assert_frame_equal(result, expected)
182+
183+
# From a dataframe with incorrect data type for fill_value
184+
result = df.unstack(fill_value=0.5)
185+
186+
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
187+
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
188+
expected.columns = MultiIndex.from_tuples(
189+
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
190+
assert_frame_equal(result, expected)
191+
192+
# Test unstacking with date times
193+
dv = pd.date_range('2012-01-01', periods=4).values
194+
data = Series(dv)
195+
data.index = MultiIndex.from_tuples(
196+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
197+
198+
result = data.unstack()
199+
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
200+
'b': [dv[1], dv[2], pd.NaT]},
201+
index=['x', 'y', 'z'])
202+
assert_frame_equal(result, expected)
203+
204+
result = data.unstack(fill_value=dv[0])
205+
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
206+
'b': [dv[1], dv[2], dv[0]]},
207+
index=['x', 'y', 'z'])
208+
assert_frame_equal(result, expected)
209+
210+
# Test unstacking with time deltas
211+
td = [Timedelta(days=i) for i in range(4)]
212+
data = Series(td)
213+
data.index = MultiIndex.from_tuples(
214+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
215+
216+
result = data.unstack()
217+
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
218+
'b': [td[1], td[2], pd.NaT]},
219+
index=['x', 'y', 'z'])
220+
assert_frame_equal(result, expected)
221+
222+
result = data.unstack(fill_value=td[1])
223+
expected = DataFrame({'a': [td[0], td[1], td[3]],
224+
'b': [td[1], td[2], td[1]]},
225+
index=['x', 'y', 'z'])
226+
assert_frame_equal(result, expected)
227+
228+
# Test unstacking with period
229+
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
230+
Period('2012-04')]
231+
data = Series(periods)
232+
data.index = MultiIndex.from_tuples(
233+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
234+
235+
result = data.unstack()
236+
expected = DataFrame({'a': [periods[0], None, periods[3]],
237+
'b': [periods[1], periods[2], None]},
238+
index=['x', 'y', 'z'])
239+
assert_frame_equal(result, expected)
240+
241+
result = data.unstack(fill_value=periods[1])
242+
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
243+
'b': [periods[1], periods[2], periods[1]]},
244+
index=['x', 'y', 'z'])
245+
assert_frame_equal(result, expected)
246+
247+
# Test unstacking with categorical
248+
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
249+
data.index = pd.MultiIndex.from_tuples(
250+
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
251+
252+
# By default missing values will be NaN
253+
result = data.unstack()
254+
expected = DataFrame({'a': pd.Categorical(list('axa'),
255+
categories=list('abc')),
256+
'b': pd.Categorical(list('bcx'),
257+
categories=list('abc'))},
258+
index=list('xyz'))
259+
assert_frame_equal(result, expected)
260+
261+
# Fill with non-category results in NaN entries similar to above
262+
result = data.unstack(fill_value='d')
263+
assert_frame_equal(result, expected)
264+
265+
# Fill with category value replaces missing values as expected
266+
result = data.unstack(fill_value='c')
267+
expected = DataFrame({'a': pd.Categorical(list('aca'),
268+
categories=list('abc')),
269+
'b': pd.Categorical(list('bcc'),
270+
categories=list('abc'))},
271+
index=list('xyz'))
272+
assert_frame_equal(result, expected)
273+
139274
def test_stack_ints(self):
140275
df = DataFrame(
141276
np.random.randn(30, 27),

0 commit comments

Comments
 (0)