Skip to content

Commit f8fd05d

Browse files
committed
Merge pull request #9517 from behzadnouri/unstack-multiple
BUG: multiple level unstack with nulls
2 parents 827c854 + fd36e6e commit f8fd05d

File tree

4 files changed

+39
-6
lines changed

4 files changed

+39
-6
lines changed

doc/source/whatsnew/v0.16.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ Bug Fixes
434434
- Fixed bug on big endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`).
435435

436436
- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`)
437-
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
437+
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`, :issue:`9497`)
438438
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
439439
- Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
440440
- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).

pandas/core/groupby.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -1379,7 +1379,8 @@ def ngroups(self):
13791379
def recons_labels(self):
13801380
comp_ids, obs_ids, _ = self.group_info
13811381
labels = (ping.labels for ping in self.groupings)
1382-
return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)
1382+
return decons_obs_group_ids(comp_ids,
1383+
obs_ids, self.shape, labels, xnull=True)
13831384

13841385
@cache_readonly
13851386
def result_index(self):
@@ -3567,13 +3568,26 @@ def decons_group_index(comp_labels, shape):
35673568
return label_list[::-1]
35683569

35693570

3570-
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
3571-
"""reconstruct labels from observed ids"""
3571+
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
3572+
"""
3573+
reconstruct labels from observed group ids
3574+
3575+
Parameters
3576+
----------
3577+
xnull: boolean,
3578+
whether nulls are excluded, i.e. whether -1 labels are passed through
3579+
"""
35723580
from pandas.hashtable import unique_label_indices
35733581

3582+
if not xnull:
3583+
lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
3584+
shape = np.asarray(shape, dtype='i8') + lift
3585+
35743586
if not _int64_overflow_possible(shape):
35753587
# obs ids are deconstructable! take the fast route!
3576-
return decons_group_index(obs_ids, shape)
3588+
out = decons_group_index(obs_ids, shape)
3589+
return out if xnull or not lift.any() \
3590+
else [x - y for x, y in zip(out, lift)]
35773591

35783592
i = unique_label_indices(comp_ids)
35793593
i8copy = lambda a: a.astype('i8', subok=False, copy=True)

pandas/core/reshape.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ def _unstack_multiple(data, clocs):
260260
group_index = get_group_index(clabels, shape, sort=False, xnull=False)
261261

262262
comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
263-
recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)
263+
recons_labels = decons_obs_group_ids(comp_ids,
264+
obs_ids, shape, clabels, xnull=False)
264265

265266
dummy_index = MultiIndex(levels=rlevels + [obs_ids],
266267
labels=rlabels + [comp_ids],

pandas/tests/test_frame.py

+18
Original file line numberDiff line numberDiff line change
@@ -12522,6 +12522,24 @@ def verify(df):
1252212522
left = df.ix[17264:].copy().set_index(['s_id','dosage','agent'])
1252312523
assert_frame_equal(left.unstack(), right)
1252412524

12525+
# GH9497 - multiple unstack with nulls
12526+
df = DataFrame({'1st':[1, 2, 1, 2, 1, 2],
12527+
'2nd':pd.date_range('2014-02-01', periods=6, freq='D'),
12528+
'jim':100 + np.arange(6),
12529+
'joe':(np.random.randn(6) * 10).round(2)})
12530+
12531+
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
12532+
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
12533+
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
12534+
12535+
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
12536+
self.assertEqual(left.notnull().values.sum(), 2 * len(df))
12537+
12538+
for col in ['jim', 'joe']:
12539+
for _, r in df.iterrows():
12540+
key = r['1st'], (col, r['2nd'], r['3rd'])
12541+
self.assertEqual(r[col], left.loc[key])
12542+
1252512543
def test_stack_datetime_column_multiIndex(self):
1252612544
# GH 8039
1252712545
t = datetime(2014, 1, 1)

0 commit comments

Comments
 (0)