Skip to content

BUG: multiple level unstack with nulls #9517

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 5, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ Bug Fixes
- Fixed bug on big endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`).

- Bug in ``MultiIndex.has_duplicates`` when having many levels caused an indexer overflow (:issue:`9075`, :issue:`5873`)
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`)
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`, :issue:`9497`)
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).
- Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`).
- Bug in ``groupby`` when key space exceeds ``int64`` bounds (:issue:`9096`).
Expand Down
22 changes: 18 additions & 4 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1382,7 +1382,8 @@ def ngroups(self):
def recons_labels(self):
comp_ids, obs_ids, _ = self.group_info
labels = (ping.labels for ping in self.groupings)
return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels)
return decons_obs_group_ids(comp_ids,
obs_ids, self.shape, labels, xnull=True)

@cache_readonly
def result_index(self):
Expand Down Expand Up @@ -3570,13 +3571,26 @@ def decons_group_index(comp_labels, shape):
return label_list[::-1]


def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
"""reconstruct labels from observed ids"""
def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
"""
reconstruct labels from observed group ids

Parameters
----------
xnull: boolean,
if nulls are excluded; i.e. -1 labels are passed through
"""
from pandas.hashtable import unique_label_indices

if not xnull:
lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
shape = np.asarray(shape, dtype='i8') + lift

if not _int64_overflow_possible(shape):
# obs ids are deconstructable! take the fast route!
return decons_group_index(obs_ids, shape)
out = decons_group_index(obs_ids, shape)
return out if xnull or not lift.any() \
else [x - y for x, y in zip(out, lift)]

i = unique_label_indices(comp_ids)
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ def _unstack_multiple(data, clocs):
group_index = get_group_index(clabels, shape, sort=False, xnull=False)

comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels)
recons_labels = decons_obs_group_ids(comp_ids,
obs_ids, shape, clabels, xnull=False)

dummy_index = MultiIndex(levels=rlevels + [obs_ids],
labels=rlabels + [comp_ids],
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12494,6 +12494,24 @@ def verify(df):
left = df.ix[17264:].copy().set_index(['s_id','dosage','agent'])
assert_frame_equal(left.unstack(), right)

# GH9497 - multiple unstack with nulls
df = DataFrame({'1st':[1, 2, 1, 2, 1, 2],
'2nd':pd.date_range('2014-02-01', periods=6, freq='D'),
'jim':100 + np.arange(6),
'joe':(np.random.randn(6) * 10).round(2)})

df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan

left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
self.assertEqual(left.notnull().values.sum(), 2 * len(df))

for col in ['jim', 'joe']:
for _, r in df.iterrows():
key = r['1st'], (col, r['2nd'], r['3rd'])
self.assertEqual(r[col], left.loc[key])

def test_stack_datetime_column_multiIndex(self):
# GH 8039
t = datetime(2014, 1, 1)
Expand Down