Merge pull request #9517 from behzadnouri/unstack-multiple

jreback · jreback · commit f8fd05db03b6 · 2015-03-05T18:25:33.000-05:00
BUG: multiple level unstack with nulls
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -434,7 +434,7 @@ Bug Fixes
 - Fixed bug on big endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`).
 
 - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`)
-- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
+- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`, :issue:`9497`)
 - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
 - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
 - Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1379,7 +1379,8 @@ def ngroups(self):
     def recons_labels(self):
         comp_ids, obs_ids, _ = self.group_info
         labels = (ping.labels for ping in self.groupings)
-        return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)
+        return decons_obs_group_ids(comp_ids,
+                obs_ids, self.shape, labels, xnull=True)
 
     @cache_readonly
     def result_index(self):
@@ -3567,13 +3568,26 @@ def decons_group_index(comp_labels, shape):
     return label_list[::-1]
 
 
-def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
-    """reconstruct labels from observed ids"""
+def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
+    """
+    reconstruct labels from observed group ids
+
+    Parameters
+    ----------
+    xnull : boolean
+        True if nulls are excluded; i.e. -1 labels are passed through
+    """
     from pandas.hashtable import unique_label_indices
 
+    if not xnull:
+        lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
+        shape = np.asarray(shape, dtype='i8') + lift
+
     if not _int64_overflow_possible(shape):
         # obs ids are deconstructable! take the fast route!
-        return decons_group_index(obs_ids, shape)
+        out = decons_group_index(obs_ids, shape)
+        return out if xnull or not lift.any() \
+                else [x - y for x, y in zip(out, lift)]
 
     i = unique_label_indices(comp_ids)
     i8copy = lambda a: a.astype('i8', subok=False, copy=True)
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -260,7 +260,8 @@ def _unstack_multiple(data, clocs):
     group_index = get_group_index(clabels, shape, sort=False, xnull=False)
 
     comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
-    recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)
+    recons_labels = decons_obs_group_ids(comp_ids,
+                       obs_ids, shape, clabels, xnull=False)
 
     dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                              labels=rlabels + [comp_ids],
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -12522,6 +12522,24 @@ def verify(df):
         left = df.ix[17264:].copy().set_index(['s_id','dosage','agent'])
         assert_frame_equal(left.unstack(), right)
 
+        # GH9497 - multiple unstack with nulls
+        df = DataFrame({'1st':[1, 2, 1, 2, 1, 2],
+                        '2nd':pd.date_range('2014-02-01', periods=6, freq='D'),
+                        'jim':100 + np.arange(6),
+                        'joe':(np.random.randn(6) * 10).round(2)})
+
+        df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
+        df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
+        df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
+
+        left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
+        self.assertEqual(left.notnull().values.sum(), 2 * len(df))
+
+        for col in ['jim', 'joe']:
+           for _, r in df.iterrows():
+               key = r['1st'], (col, r['2nd'], r['3rd'])
+               self.assertEqual(r[col], left.loc[key])
+
     def test_stack_datetime_column_multiIndex(self):
         # GH 8039
         t = datetime(2014, 1, 1)