Skip to content

Commit fd36e6e

Browse files
committed
multiple level unstack with nulls
1 parent bceb342 commit fd36e6e

File tree

4 files changed

+39
-6
lines changed

4 files changed

+39
-6
lines changed

doc/source/whatsnew/v0.16.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ Bug Fixes
235235
- Fixed bug on big-endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`).
236236

237237
- Bug in ``MultiIndex.has_duplicates`` where having many levels caused an indexer overflow (:issue:`9075`, :issue:`5873`)
238-
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
238+
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`, :issue:`9497`)
239239
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
240240
- Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
241241
- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).

pandas/core/groupby.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,8 @@ def ngroups(self):
13821382
def recons_labels(self):
13831383
comp_ids, obs_ids, _ = self.group_info
13841384
labels = (ping.labels for ping in self.groupings)
1385-
return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)
1385+
return decons_obs_group_ids(comp_ids,
1386+
obs_ids, self.shape, labels, xnull=True)
13861387

13871388
@cache_readonly
13881389
def result_index(self):
@@ -3570,13 +3571,26 @@ def decons_group_index(comp_labels, shape):
35703571
return label_list[::-1]
35713572

35723573

3573-
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
3574-
"""reconstruct labels from observed ids"""
3574+
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
3575+
"""
3576+
reconstruct labels from observed group ids
3577+
3578+
Parameters
3579+
----------
3580+
xnull: boolean,
3581+
if nulls are excluded; i.e. -1 labels are passed through
3582+
"""
35753583
from pandas.hashtable import unique_label_indices
35763584

3585+
if not xnull:
3586+
lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
3587+
shape = np.asarray(shape, dtype='i8') + lift
3588+
35773589
if not _int64_overflow_possible(shape):
35783590
# obs ids are deconstructable! take the fast route!
3579-
return decons_group_index(obs_ids, shape)
3591+
out = decons_group_index(obs_ids, shape)
3592+
return out if xnull or not lift.any() \
3593+
else [x - y for x, y in zip(out, lift)]
35803594

35813595
i = unique_label_indices(comp_ids)
35823596
i8copy = lambda a: a.astype('i8', subok=False, copy=True)

pandas/core/reshape.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ def _unstack_multiple(data, clocs):
260260
group_index = get_group_index(clabels, shape, sort=False, xnull=False)
261261

262262
comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
263-
recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)
263+
recons_labels = decons_obs_group_ids(comp_ids,
264+
obs_ids, shape, clabels, xnull=False)
264265

265266
dummy_index = MultiIndex(levels=rlevels + [obs_ids],
266267
labels=rlabels + [comp_ids],

pandas/tests/test_frame.py

+18
Original file line numberDiff line numberDiff line change
@@ -12494,6 +12494,24 @@ def verify(df):
1249412494
left = df.ix[17264:].copy().set_index(['s_id','dosage','agent'])
1249512495
assert_frame_equal(left.unstack(), right)
1249612496

12497+
# GH9497 - multiple unstack with nulls
12498+
df = DataFrame({'1st':[1, 2, 1, 2, 1, 2],
12499+
'2nd':pd.date_range('2014-02-01', periods=6, freq='D'),
12500+
'jim':100 + np.arange(6),
12501+
'joe':(np.random.randn(6) * 10).round(2)})
12502+
12503+
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
12504+
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
12505+
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
12506+
12507+
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
12508+
self.assertEqual(left.notnull().values.sum(), 2 * len(df))
12509+
12510+
for col in ['jim', 'joe']:
12511+
for _, r in df.iterrows():
12512+
key = r['1st'], (col, r['2nd'], r['3rd'])
12513+
self.assertEqual(r[col], left.loc[key])
12514+
1249712515
def test_stack_datetime_column_multiIndex(self):
1249812516
# GH 8039
1249912517
t = datetime(2014, 1, 1)

0 commit comments

Comments
 (0)