Skip to content

Commit e3afd5b

Browse files
committed
unstack with nan in the index
1 parent 0fe43a6 commit e3afd5b

File tree

6 files changed

+154
-138
lines changed

6 files changed

+154
-138
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ Bug Fixes
6767

6868

6969
- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`)
70+
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`7466`)
7071

7172

7273

pandas/core/groupby.py

+56-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pandas.compat import(
88
zip, builtins, range, long, lzip,
9-
OrderedDict, callable
9+
OrderedDict, callable, filter, map
1010
)
1111
from pandas import compat
1212

@@ -3510,6 +3510,61 @@ def get_group_index(label_list, shape):
35103510
np.putmask(group_index, mask, -1)
35113511
return group_index
35123512

3513+
3514+
def get_flat_ids(labels, shape, retain_lex_rank):
3515+
"""
3516+
Given a list of labels at each level, returns a flat array of int64 ids
3517+
corresponding to unique tuples across the labels. If `retain_lex_rank`,
3518+
the ranks of the returned ids preserve the lexical ranks of the labels.
3519+
3520+
Parameters
3521+
----------
3522+
labels: sequence of arrays
3523+
Integers identifying levels at each location
3524+
shape: sequence of ints same length as labels
3525+
Number of unique levels at each location
3526+
retain_lex_rank: boolean
3527+
If the ranks of returned ids should match lexical ranks of labels
3528+
3529+
Returns
3530+
-------
3531+
An array of type int64 where two elements are equal if their corresponding
3532+
labels are equal at all locations.
3533+
"""
3534+
def loop(labels, shape):
3535+
# how many levels can be done without overflow:
3536+
pred = lambda i: not _int64_overflow_possible(shape[:i])
3537+
nlev = next(filter(pred, range(len(shape), 0, -1)))
3538+
3539+
# compute flat ids for the first `nlev` levels
3540+
stride = np.prod(shape[1:nlev], dtype='i8')
3541+
out = stride * labels[0].astype('i8', subok=False, copy=False)
3542+
3543+
for i in range(1, nlev):
3544+
stride //= shape[i]
3545+
out += labels[i] * stride
3546+
3547+
if nlev == len(shape): # all levels done!
3548+
return out
3549+
3550+
# compress what has been done so far in order to avoid overflow
3551+
# to retain lexical ranks, obs_ids should be sorted
3552+
comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)
3553+
3554+
labels = [comp_ids] + labels[nlev:]
3555+
shape = [len(obs_ids)] + shape[nlev:]
3556+
3557+
return loop(labels, shape)
3558+
3559+
def maybe_lift(lab, size): # promote nan values
3560+
return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
3561+
3562+
labels = map(com._ensure_int64, labels)
3563+
labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
3564+
3565+
return loop(labels, shape)
3566+
3567+
35133568
_INT64_MAX = np.iinfo(np.int64).max
35143569

35153570

pandas/core/index.py

+4-35
Original file line numberDiff line numberDiff line change
@@ -3226,44 +3226,13 @@ def _has_complex_internals(self):
32263226
@cache_readonly
32273227
def is_unique(self):
32283228
from pandas.hashtable import Int64HashTable
3229-
3230-
def _get_group_index(labels, shape):
3231-
from pandas.core.groupby import _int64_overflow_possible, \
3232-
_compress_group_index
3233-
3234-
# how many levels can be done without overflow
3235-
pred = lambda i: not _int64_overflow_possible(shape[:i])
3236-
nlev = next(filter(pred, range(len(shape), 0, -1)))
3237-
3238-
# compute group indicies for the first `nlev` levels
3239-
group_index = labels[0].astype('i8', subok=False, copy=True)
3240-
stride = shape[0]
3241-
3242-
for i in range(1, nlev):
3243-
group_index += labels[i] * stride
3244-
stride *= shape[i]
3245-
3246-
if nlev == len(shape):
3247-
return group_index
3248-
3249-
comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
3250-
3251-
labels = [comp_ids] + labels[nlev:]
3252-
shape = [len(obs_ids)] + shape[nlev:]
3253-
3254-
return _get_group_index(labels, shape)
3255-
3256-
def _maybe_lift(lab, size): # pormote nan values
3257-
return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
3229+
from pandas.core.groupby import get_flat_ids
32583230

32593231
shape = map(len, self.levels)
3260-
labels = map(_ensure_int64, self.labels)
3261-
3262-
labels, shape = map(list, zip(*map(_maybe_lift, labels, shape)))
3263-
group_index = _get_group_index(labels, shape)
3232+
ids = get_flat_ids(self.labels, shape, False)
3233+
table = Int64HashTable(min(1 << 20, len(ids)))
32643234

3265-
table = Int64HashTable(min(1 << 20, len(group_index)))
3266-
return len(table.unique(group_index)) == len(self)
3235+
return len(table.unique(ids)) == len(self)
32673236

32683237
def get_value(self, series, key):
32693238
# somewhat broken encapsulation

pandas/core/reshape.py

+32-91
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,10 @@ def __init__(self, values, index, level=-1, value_columns=None):
8282

8383
self.level = self.index._get_level_number(level)
8484

85-
levels = index.levels
86-
labels = index.labels
87-
88-
def _make_index(lev, lab):
89-
values = _make_index_array_level(lev.values, lab)
90-
i = lev._simple_new(values, lev.name,
91-
freq=getattr(lev, 'freq', None),
92-
tz=getattr(lev, 'tz', None))
93-
return i
94-
95-
self.new_index_levels = [_make_index(lev, lab)
96-
for lev, lab in zip(levels, labels)]
85+
# when index includes `nan`, need to lift levels/strides by 1
86+
self.lift = 1 if -1 in self.index.labels[self.level] else 0
87+
88+
self.new_index_levels = list(index.levels)
9789
self.new_index_names = list(index.names)
9890

9991
self.removed_name = self.new_index_names.pop(self.level)
@@ -134,10 +126,10 @@ def _make_selectors(self):
134126
ngroups = len(obs_ids)
135127

136128
comp_index = _ensure_platform_int(comp_index)
137-
stride = self.index.levshape[self.level]
129+
stride = self.index.levshape[self.level] + self.lift
138130
self.full_shape = ngroups, stride
139131

140-
selector = self.sorted_labels[-1] + stride * comp_index
132+
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
141133
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
142134
mask.put(selector, True)
143135

@@ -166,20 +158,6 @@ def get_result(self):
166158
values = com.take_nd(values, inds, axis=1)
167159
columns = columns[inds]
168160

169-
# we might have a missing index
170-
if len(index) != values.shape[0]:
171-
mask = isnull(index)
172-
if mask.any():
173-
l = np.arange(len(index))
174-
values, orig_values = (np.empty((len(index), values.shape[1])),
175-
values)
176-
values.fill(np.nan)
177-
values_indexer = com._ensure_int64(l[~mask])
178-
for i, j in enumerate(values_indexer):
179-
values[j] = orig_values[i]
180-
else:
181-
index = index.take(self.unique_groups)
182-
183161
# may need to coerce categoricals here
184162
if self.is_categorical is not None:
185163
values = [ Categorical.from_array(values[:,i],
@@ -220,9 +198,16 @@ def get_new_values(self):
220198

221199
def get_new_columns(self):
222200
if self.value_columns is None:
223-
return self.removed_level
201+
if self.lift == 0:
202+
return self.removed_level
203+
204+
lev = self.removed_level
205+
vals = np.insert(lev.astype('object'), 0,
206+
_get_na_value(lev.dtype.type))
207+
208+
return lev._shallow_copy(vals)
224209

225-
stride = len(self.removed_level)
210+
stride = len(self.removed_level) + self.lift
226211
width = len(self.value_columns)
227212
propagator = np.repeat(np.arange(width), stride)
228213
if isinstance(self.value_columns, MultiIndex):
@@ -231,59 +216,34 @@ def get_new_columns(self):
231216

232217
new_labels = [lab.take(propagator)
233218
for lab in self.value_columns.labels]
234-
new_labels.append(np.tile(np.arange(stride), width))
235219
else:
236220
new_levels = [self.value_columns, self.removed_level]
237221
new_names = [self.value_columns.name, self.removed_name]
222+
new_labels = [propagator]
238223

239-
new_labels = []
240-
241-
new_labels.append(propagator)
242-
new_labels.append(np.tile(np.arange(stride), width))
243-
224+
new_labels.append(np.tile(np.arange(stride) - self.lift, width))
244225
return MultiIndex(levels=new_levels, labels=new_labels,
245226
names=new_names, verify_integrity=False)
246227

247228
def get_new_index(self):
248-
result_labels = []
249-
for cur in self.sorted_labels[:-1]:
250-
labels = cur.take(self.compressor)
251-
labels = _make_index_array_level(labels, cur)
252-
result_labels.append(labels)
229+
result_labels = [lab.take(self.compressor)
230+
for lab in self.sorted_labels[:-1]]
253231

254232
# construct the new index
255233
if len(self.new_index_levels) == 1:
256-
new_index = self.new_index_levels[0]
257-
new_index.name = self.new_index_names[0]
258-
else:
259-
new_index = MultiIndex(levels=self.new_index_levels,
260-
labels=result_labels,
261-
names=self.new_index_names,
262-
verify_integrity=False)
263-
264-
return new_index
234+
lev, lab = self.new_index_levels[0], result_labels[0]
235+
if not (lab == -1).any():
236+
return lev.take(lab)
265237

238+
vals = np.insert(lev.astype('object'), len(lev),
239+
_get_na_value(lev.dtype.type)).take(lab)
266240

267-
def _make_index_array_level(lev, lab):
268-
""" create the combined index array, preserving nans, return an array """
269-
mask = lab == -1
270-
if not mask.any():
271-
return lev
272-
273-
l = np.arange(len(lab))
274-
mask_labels = np.empty(len(mask[mask]), dtype=object)
275-
mask_labels.fill(_get_na_value(lev.dtype.type))
276-
mask_indexer = com._ensure_int64(l[mask])
277-
278-
labels = lev
279-
labels_indexer = com._ensure_int64(l[~mask])
280-
281-
new_labels = np.empty(tuple([len(lab)]), dtype=object)
282-
new_labels[labels_indexer] = labels
283-
new_labels[mask_indexer] = mask_labels
284-
285-
return new_labels
241+
return lev._shallow_copy(vals)
286242

243+
return MultiIndex(levels=self.new_index_levels,
244+
labels=result_labels,
245+
names=self.new_index_names,
246+
verify_integrity=False)
287247

288248
def _unstack_multiple(data, clocs):
289249
if len(clocs) == 0:
@@ -483,29 +443,10 @@ def _unstack_frame(obj, level):
483443

484444

485445
def get_compressed_ids(labels, sizes):
486-
# no overflow
487-
if com._long_prod(sizes) < 2 ** 63:
488-
group_index = get_group_index(labels, sizes)
489-
comp_index, obs_ids = _compress_group_index(group_index)
490-
else:
491-
n = len(labels[0])
492-
mask = np.zeros(n, dtype=bool)
493-
for v in labels:
494-
mask |= v < 0
495-
496-
while com._long_prod(sizes) >= 2 ** 63:
497-
i = len(sizes)
498-
while com._long_prod(sizes[:i]) >= 2 ** 63:
499-
i -= 1
500-
501-
rem_index, rem_ids = get_compressed_ids(labels[:i],
502-
sizes[:i])
503-
sizes = [len(rem_ids)] + sizes[i:]
504-
labels = [rem_index] + labels[i:]
505-
506-
return get_compressed_ids(labels, sizes)
446+
from pandas.core.groupby import get_flat_ids
507447

508-
return comp_index, obs_ids
448+
ids = get_flat_ids(labels, sizes, True)
449+
return _compress_group_index(ids, sort=True)
509450

510451

511452
def stack(frame, level=-1, dropna=True):

pandas/tests/test_frame.py

+48-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import nose
1212
import functools
1313
import itertools
14-
from itertools import product
14+
from itertools import product, permutations
1515
from distutils.version import LooseVersion
1616

1717
from pandas.compat import(
@@ -12334,6 +12334,53 @@ def test_unstack_non_unique_index_names(self):
1233412334
with tm.assertRaises(ValueError):
1233512335
df.T.stack('c1')
1233612336

12337+
def test_unstack_nan_index(self): # GH7466
12338+
cast = lambda val: '{0:1}'.format('' if val != val else val)
12339+
nan = np.nan
12340+
12341+
def verify(df):
12342+
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
12343+
rows, cols = df.notnull().values.nonzero()
12344+
for i, j in zip(rows, cols):
12345+
left = sorted(df.iloc[i, j].split('.'))
12346+
right = mk_list(df.index[i]) + mk_list(df.columns[j])
12347+
right = sorted(list(map(cast, right)))
12348+
self.assertEqual(left, right)
12349+
12350+
df = DataFrame({'jim':['a', 'b', nan, 'd'],
12351+
'joe':['w', 'x', 'y', 'z'],
12352+
'jolie':['a.w', 'b.x', ' .y', 'd.z']})
12353+
12354+
left = df.set_index(['jim', 'joe']).unstack()['jolie']
12355+
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
12356+
assert_frame_equal(left, right)
12357+
12358+
for idx in permutations(df.columns[:2]):
12359+
mi = df.set_index(list(idx))
12360+
for lev in range(2):
12361+
udf = mi.unstack(level=lev)
12362+
self.assertEqual(udf.notnull().values.sum(), len(df))
12363+
verify(udf['jolie'])
12364+
12365+
df = DataFrame({'1st':['d'] * 3 + [nan] * 5 + ['a'] * 2 +
12366+
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
12367+
'2nd':['y'] * 2 + ['w'] * 3 + [nan] * 3 +
12368+
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
12369+
'3rd':[67,39,53,72,57,80,31,18,11,30,59,
12370+
50,62,59,76,52,14,53,60,51]})
12371+
12372+
df['4th'], df['5th'] = \
12373+
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
12374+
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
12375+
12376+
for idx in permutations(['1st', '2nd', '3rd']):
12377+
mi = df.set_index(list(idx))
12378+
for lev in range(3):
12379+
udf = mi.unstack(level=lev)
12380+
self.assertEqual(udf.notnull().values.sum(), 2 * len(df))
12381+
for col in ['4th', '5th']:
12382+
verify(udf[col])
12383+
1233712384
def test_stack_datetime_column_multiIndex(self):
1233812385
# GH 8039
1233912386
t = datetime(2014, 1, 1)

0 commit comments

Comments
 (0)