diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 0468c220bcb98..fc30e24588d15 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -235,7 +235,7 @@ Bug Fixes - Fixed bug on bug endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`). - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`) -- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`) +- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`, :issue:`9497`) - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`). - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`). - Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`). diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0be046bbdec42..635edaf60a8b7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1382,7 +1382,8 @@ def ngroups(self): def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels) + return decons_obs_group_ids(comp_ids, + obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): @@ -3570,13 +3571,26 @@ def decons_group_index(comp_labels, shape): return label_list[::-1] -def decons_obs_group_ids(comp_ids, obs_ids, shape, labels): - """reconstruct labels from observed ids""" +def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): + """ + reconstruct labels from observed group ids + + Parameters + ---------- + xnull: boolean, + if nulls are excluded; i.e. -1 labels are passed through + """ from pandas.hashtable import unique_label_indices + if not xnull: + lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8') + shape = np.asarray(shape, dtype='i8') + lift + if not _int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! - return decons_group_index(obs_ids, shape) + out = decons_group_index(obs_ids, shape) + return out if xnull or not lift.any() \ + else [x - y for x, y in zip(out, lift)] i = unique_label_indices(comp_ids) i8copy = lambda a: a.astype('i8', subok=False, copy=True) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 6eb46de11210a..aa5877e656a52 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -260,7 +260,8 @@ def _unstack_multiple(data, clocs): group_index = get_group_index(clabels, shape, sort=False, xnull=False) comp_ids, obs_ids = _compress_group_index(group_index, sort=False) - recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels) + recons_labels = decons_obs_group_ids(comp_ids, + obs_ids, shape, clabels, xnull=False) dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 66e008aa16b3e..86b7dfef79618 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12494,6 +12494,24 @@ def verify(df): left = df.ix[17264:].copy().set_index(['s_id','dosage','agent']) assert_frame_equal(left.unstack(), right) + # GH9497 - multiple unstack with nulls + df = DataFrame({'1st':[1, 2, 1, 2, 1, 2], + '2nd':pd.date_range('2014-02-01', periods=6, freq='D'), + 'jim':100 + np.arange(6), + 'joe':(np.random.randn(6) * 10).round(2)}) + + df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02') + df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan + df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan + + left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd']) + self.assertEqual(left.notnull().values.sum(), 2 * len(df)) + + for col in ['jim', 'joe']: + for _, r in df.iterrows(): + key = r['1st'], (col, r['2nd'], r['3rd']) + self.assertEqual(r[col], left.loc[key]) + def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1)