Skip to content

pivot & unstack with nan in the index #9061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Bug Fixes


- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`)
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`7466`)



Expand Down
57 changes: 56 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from pandas.compat import(
zip, builtins, range, long, lzip,
OrderedDict, callable
OrderedDict, callable, filter, map
)
from pandas import compat

Expand Down Expand Up @@ -3510,6 +3510,61 @@ def get_group_index(label_list, shape):
np.putmask(group_index, mask, -1)
return group_index


def get_flat_ids(labels, shape, retain_lex_rank):
    """
    Given a list of labels at each level, returns a flat array of int64 ids
    corresponding to unique tuples across the labels.  If `retain_lex_rank`,
    rank of returned ids preserve lexical ranks of labels.

    Parameters
    ----------
    labels: sequence of arrays
        Integers identifying levels at each location
    shape: sequence of ints same length as labels
        Number of unique levels at each location
    retain_lex_rank: boolean
        If the ranks of returned ids should match lexical ranks of labels

    Returns
    -------
    An array of type int64 where two elements are equal if their corresponding
    labels are equal at all locations.
    """
    def loop(labels, shape):
        # how many levels can be done without int64 overflow:
        pred = lambda i: not _int64_overflow_possible(shape[:i])
        nlev = next(filter(pred, range(len(shape), 0, -1)))

        # compute flat ids for the first `nlev` levels
        stride = np.prod(shape[1:nlev], dtype='i8')
        out = stride * labels[0].astype('i8', subok=False, copy=False)

        for i in range(1, nlev):
            stride //= shape[i]
            out += labels[i] * stride

        if nlev == len(shape):  # all levels done!
            return out

        # compress what has been done so far in order to avoid overflow;
        # to retain lexical ranks, obs_ids should be sorted
        comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank)

        labels = [comp_ids] + labels[nlev:]
        shape = [len(obs_ids)] + shape[nlev:]

        return loop(labels, shape)

    def maybe_lift(lab, size):  # promote nan values (label == -1) to 0
        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)

    labels = map(com._ensure_int64, labels)
    labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))

    return loop(labels, shape)


_INT64_MAX = np.iinfo(np.int64).max


Expand Down
39 changes: 4 additions & 35 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -3226,44 +3226,13 @@ def _has_complex_internals(self):
@cache_readonly
def is_unique(self):
    """
    Return True if this MultiIndex has no duplicate entries.

    Each row is mapped to a single int64 id (one id per unique label
    tuple, with nan labels handled by `get_flat_ids`), and the index is
    unique iff the number of distinct ids equals the index length.
    """
    from pandas.hashtable import Int64HashTable
    from pandas.core.groupby import get_flat_ids

    shape = map(len, self.levels)

    # lexical rank of ids is irrelevant here, only equality matters
    ids = get_flat_ids(self.labels, shape, False)
    # cap the initial table size; 2**20 buckets is plenty to start with
    table = Int64HashTable(min(1 << 20, len(ids)))

    return len(table.unique(ids)) == len(self)

def get_value(self, series, key):
# somewhat broken encapsulation
Expand Down
123 changes: 32 additions & 91 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,18 +82,10 @@ def __init__(self, values, index, level=-1, value_columns=None):

self.level = self.index._get_level_number(level)

levels = index.levels
labels = index.labels

def _make_index(lev, lab):
values = _make_index_array_level(lev.values, lab)
i = lev._simple_new(values, lev.name,
freq=getattr(lev, 'freq', None),
tz=getattr(lev, 'tz', None))
return i

self.new_index_levels = [_make_index(lev, lab)
for lev, lab in zip(levels, labels)]
# when index includes `nan`, need to lift levels/strides by 1
self.lift = 1 if -1 in self.index.labels[self.level] else 0

self.new_index_levels = list(index.levels)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps use .copy() here rather than list. ?

self.new_index_names = list(index.names)

self.removed_name = self.new_index_names.pop(self.level)
Expand Down Expand Up @@ -134,10 +126,10 @@ def _make_selectors(self):
ngroups = len(obs_ids)

comp_index = _ensure_platform_int(comp_index)
stride = self.index.levshape[self.level]
stride = self.index.levshape[self.level] + self.lift
self.full_shape = ngroups, stride

selector = self.sorted_labels[-1] + stride * comp_index
selector = self.sorted_labels[-1] + stride * comp_index + self.lift
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
mask.put(selector, True)

Expand Down Expand Up @@ -166,20 +158,6 @@ def get_result(self):
values = com.take_nd(values, inds, axis=1)
columns = columns[inds]

# we might have a missing index
if len(index) != values.shape[0]:
mask = isnull(index)
if mask.any():
l = np.arange(len(index))
values, orig_values = (np.empty((len(index), values.shape[1])),
values)
values.fill(np.nan)
values_indexer = com._ensure_int64(l[~mask])
for i, j in enumerate(values_indexer):
values[j] = orig_values[i]
else:
index = index.take(self.unique_groups)

# may need to coerce categoricals here
if self.is_categorical is not None:
values = [ Categorical.from_array(values[:,i],
Expand Down Expand Up @@ -220,9 +198,16 @@ def get_new_values(self):

def get_new_columns(self):
if self.value_columns is None:
return self.removed_level
if self.lift == 0:
return self.removed_level

lev = self.removed_level
vals = np.insert(lev.astype('object'), 0,
_get_na_value(lev.dtype.type))

return lev._shallow_copy(vals)

stride = len(self.removed_level)
stride = len(self.removed_level) + self.lift
width = len(self.value_columns)
propagator = np.repeat(np.arange(width), stride)
if isinstance(self.value_columns, MultiIndex):
Expand All @@ -231,59 +216,34 @@ def get_new_columns(self):

new_labels = [lab.take(propagator)
for lab in self.value_columns.labels]
new_labels.append(np.tile(np.arange(stride), width))
else:
new_levels = [self.value_columns, self.removed_level]
new_names = [self.value_columns.name, self.removed_name]
new_labels = [propagator]

new_labels = []

new_labels.append(propagator)
new_labels.append(np.tile(np.arange(stride), width))

new_labels.append(np.tile(np.arange(stride) - self.lift, width))
return MultiIndex(levels=new_levels, labels=new_labels,
names=new_names, verify_integrity=False)

def get_new_index(self):
result_labels = []
for cur in self.sorted_labels[:-1]:
labels = cur.take(self.compressor)
labels = _make_index_array_level(labels, cur)
result_labels.append(labels)
result_labels = [lab.take(self.compressor)
for lab in self.sorted_labels[:-1]]

# construct the new index
if len(self.new_index_levels) == 1:
new_index = self.new_index_levels[0]
new_index.name = self.new_index_names[0]
else:
new_index = MultiIndex(levels=self.new_index_levels,
labels=result_labels,
names=self.new_index_names,
verify_integrity=False)

return new_index
lev, lab = self.new_index_levels[0], result_labels[0]
if not (lab == -1).any():
return lev.take(lab)

vals = np.insert(lev.astype('object'), len(lev),
_get_na_value(lev.dtype.type)).take(lab)

def _make_index_array_level(lev, lab):
""" create the combined index array, preserving nans, return an array """
mask = lab == -1
if not mask.any():
return lev

l = np.arange(len(lab))
mask_labels = np.empty(len(mask[mask]), dtype=object)
mask_labels.fill(_get_na_value(lev.dtype.type))
mask_indexer = com._ensure_int64(l[mask])

labels = lev
labels_indexer = com._ensure_int64(l[~mask])

new_labels = np.empty(tuple([len(lab)]), dtype=object)
new_labels[labels_indexer] = labels
new_labels[mask_indexer] = mask_labels

return new_labels
return lev._shallow_copy(vals)

return MultiIndex(levels=self.new_index_levels,
labels=result_labels,
names=self.new_index_names,
verify_integrity=False)

def _unstack_multiple(data, clocs):
if len(clocs) == 0:
Expand Down Expand Up @@ -483,29 +443,10 @@ def _unstack_frame(obj, level):


def get_compressed_ids(labels, sizes):
    """
    Map each observed label tuple to a compressed (consecutive) int64 id.

    Parameters
    ----------
    labels : sequence of int arrays
        Integers identifying levels at each location
    sizes : sequence of ints, same length as labels
        Number of unique levels at each location

    Returns
    -------
    (comp_ids, obs_group_ids) from _compress_group_index; ids are sorted
    so that their rank preserves the lexical rank of the label tuples.
    """
    from pandas.core.groupby import get_flat_ids

    # get_flat_ids guards against int64 overflow internally, so no
    # manual level-chunking is required here
    ids = get_flat_ids(labels, sizes, True)
    return _compress_group_index(ids, sort=True)


def stack(frame, level=-1, dropna=True):
Expand Down
49 changes: 48 additions & 1 deletion pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import nose
import functools
import itertools
from itertools import product
from itertools import product, permutations
from distutils.version import LooseVersion

from pandas.compat import(
Expand Down Expand Up @@ -12334,6 +12334,53 @@ def test_unstack_non_unique_index_names(self):
with tm.assertRaises(ValueError):
df.T.stack('c1')

def test_unstack_nan_index(self): # GH7466
cast = lambda val: '{0:1}'.format('' if val != val else val)
nan = np.nan

def verify(df):
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
rows, cols = df.notnull().values.nonzero()
for i, j in zip(rows, cols):
left = sorted(df.iloc[i, j].split('.'))
right = mk_list(df.index[i]) + mk_list(df.columns[j])
right = sorted(list(map(cast, right)))
self.assertEqual(left, right)

df = DataFrame({'jim':['a', 'b', nan, 'd'],
'joe':['w', 'x', 'y', 'z'],
'jolie':['a.w', 'b.x', ' .y', 'd.z']})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls give a tests for multiple nans in a single level and a single nan but in multiple levels. (which you prob just raise a ValueError as it cannot be computed).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually I see you do handle this. BUT, the problem is the resulting index are full of nan levels. Hmm. This should be a warning or maybe an option to unstack/pivot. You get a useful result, but then indexing will blow up.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem is the resulting index are full of nan levels

@jreback no! there will be no nan in the levels and there is no nan level. see the last lines in the very first comment, or check df.unstack().index.levels. nan's are handled by labels. also, the point of running benchmarks is that this way of handling nan's does not impact performance.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is not a question of performance but of subsequent indexing. Having nan in the levels is currently restricted to a single nan for indexing. They make the indexes non-unique by definition and when you try to look up the locations they will fail. You can only do positional type indexing. So you leave the user with an odd structure that only partially works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my point was that this is handling nan without polluting the levels or impacting performance.

comments addressed, and all came green.


left = df.set_index(['jim', 'joe']).unstack()['jolie']
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
assert_frame_equal(left, right)

for idx in permutations(df.columns[:2]):
mi = df.set_index(list(idx))
for lev in range(2):
udf = mi.unstack(level=lev)
self.assertEqual(udf.notnull().values.sum(), len(df))
verify(udf['jolie'])

df = DataFrame({'1st':['d'] * 3 + [nan] * 5 + ['a'] * 2 +
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
'2nd':['y'] * 2 + ['w'] * 3 + [nan] * 3 +
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
'3rd':[67,39,53,72,57,80,31,18,11,30,59,
50,62,59,76,52,14,53,60,51]})

df['4th'], df['5th'] = \
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)

for idx in permutations(['1st', '2nd', '3rd']):
mi = df.set_index(list(idx))
for lev in range(3):
udf = mi.unstack(level=lev)
self.assertEqual(udf.notnull().values.sum(), 2 * len(df))
for col in ['4th', '5th']:
verify(udf[col])

def test_stack_datetime_column_multiIndex(self):
# GH 8039
t = datetime(2014, 1, 1)
Expand Down
Loading