-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
pivot & unstack with nan in the index #9061
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,18 +82,10 @@ def __init__(self, values, index, level=-1, value_columns=None): | |
|
||
self.level = self.index._get_level_number(level) | ||
|
||
levels = index.levels | ||
labels = index.labels | ||
|
||
def _make_index(lev, lab): | ||
values = _make_index_array_level(lev.values, lab) | ||
i = lev._simple_new(values, lev.name, | ||
freq=getattr(lev, 'freq', None), | ||
tz=getattr(lev, 'tz', None)) | ||
return i | ||
|
||
self.new_index_levels = [_make_index(lev, lab) | ||
for lev, lab in zip(levels, labels)] | ||
# when index includes `nan`, need to lift levels/strides by 1 | ||
self.lift = 1 if -1 in self.index.labels[self.level] else 0 | ||
|
||
self.new_index_levels = list(index.levels) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Perhaps use `.copy()` here rather than `list`? |
||
self.new_index_names = list(index.names) | ||
|
||
self.removed_name = self.new_index_names.pop(self.level) | ||
|
@@ -134,10 +126,10 @@ def _make_selectors(self): | |
ngroups = len(obs_ids) | ||
|
||
comp_index = _ensure_platform_int(comp_index) | ||
stride = self.index.levshape[self.level] | ||
stride = self.index.levshape[self.level] + self.lift | ||
self.full_shape = ngroups, stride | ||
|
||
selector = self.sorted_labels[-1] + stride * comp_index | ||
selector = self.sorted_labels[-1] + stride * comp_index + self.lift | ||
mask = np.zeros(np.prod(self.full_shape), dtype=bool) | ||
mask.put(selector, True) | ||
|
||
|
@@ -166,20 +158,6 @@ def get_result(self): | |
values = com.take_nd(values, inds, axis=1) | ||
columns = columns[inds] | ||
|
||
# we might have a missing index | ||
if len(index) != values.shape[0]: | ||
mask = isnull(index) | ||
if mask.any(): | ||
l = np.arange(len(index)) | ||
values, orig_values = (np.empty((len(index), values.shape[1])), | ||
values) | ||
values.fill(np.nan) | ||
values_indexer = com._ensure_int64(l[~mask]) | ||
for i, j in enumerate(values_indexer): | ||
values[j] = orig_values[i] | ||
else: | ||
index = index.take(self.unique_groups) | ||
|
||
# may need to coerce categoricals here | ||
if self.is_categorical is not None: | ||
values = [ Categorical.from_array(values[:,i], | ||
|
@@ -220,9 +198,16 @@ def get_new_values(self): | |
|
||
def get_new_columns(self): | ||
if self.value_columns is None: | ||
return self.removed_level | ||
if self.lift == 0: | ||
return self.removed_level | ||
|
||
lev = self.removed_level | ||
vals = np.insert(lev.astype('object'), 0, | ||
_get_na_value(lev.dtype.type)) | ||
|
||
return lev._shallow_copy(vals) | ||
|
||
stride = len(self.removed_level) | ||
stride = len(self.removed_level) + self.lift | ||
width = len(self.value_columns) | ||
propagator = np.repeat(np.arange(width), stride) | ||
if isinstance(self.value_columns, MultiIndex): | ||
|
@@ -231,59 +216,34 @@ def get_new_columns(self): | |
|
||
new_labels = [lab.take(propagator) | ||
for lab in self.value_columns.labels] | ||
new_labels.append(np.tile(np.arange(stride), width)) | ||
else: | ||
new_levels = [self.value_columns, self.removed_level] | ||
new_names = [self.value_columns.name, self.removed_name] | ||
new_labels = [propagator] | ||
|
||
new_labels = [] | ||
|
||
new_labels.append(propagator) | ||
new_labels.append(np.tile(np.arange(stride), width)) | ||
|
||
new_labels.append(np.tile(np.arange(stride) - self.lift, width)) | ||
return MultiIndex(levels=new_levels, labels=new_labels, | ||
names=new_names, verify_integrity=False) | ||
|
||
def get_new_index(self): | ||
result_labels = [] | ||
for cur in self.sorted_labels[:-1]: | ||
labels = cur.take(self.compressor) | ||
labels = _make_index_array_level(labels, cur) | ||
result_labels.append(labels) | ||
result_labels = [lab.take(self.compressor) | ||
for lab in self.sorted_labels[:-1]] | ||
|
||
# construct the new index | ||
if len(self.new_index_levels) == 1: | ||
new_index = self.new_index_levels[0] | ||
new_index.name = self.new_index_names[0] | ||
else: | ||
new_index = MultiIndex(levels=self.new_index_levels, | ||
labels=result_labels, | ||
names=self.new_index_names, | ||
verify_integrity=False) | ||
|
||
return new_index | ||
lev, lab = self.new_index_levels[0], result_labels[0] | ||
if not (lab == -1).any(): | ||
return lev.take(lab) | ||
|
||
vals = np.insert(lev.astype('object'), len(lev), | ||
_get_na_value(lev.dtype.type)).take(lab) | ||
|
||
def _make_index_array_level(lev, lab): | ||
""" create the combined index array, preserving nans, return an array """ | ||
mask = lab == -1 | ||
if not mask.any(): | ||
return lev | ||
|
||
l = np.arange(len(lab)) | ||
mask_labels = np.empty(len(mask[mask]), dtype=object) | ||
mask_labels.fill(_get_na_value(lev.dtype.type)) | ||
mask_indexer = com._ensure_int64(l[mask]) | ||
|
||
labels = lev | ||
labels_indexer = com._ensure_int64(l[~mask]) | ||
|
||
new_labels = np.empty(tuple([len(lab)]), dtype=object) | ||
new_labels[labels_indexer] = labels | ||
new_labels[mask_indexer] = mask_labels | ||
|
||
return new_labels | ||
return lev._shallow_copy(vals) | ||
|
||
return MultiIndex(levels=self.new_index_levels, | ||
labels=result_labels, | ||
names=self.new_index_names, | ||
verify_integrity=False) | ||
|
||
def _unstack_multiple(data, clocs): | ||
if len(clocs) == 0: | ||
|
@@ -483,29 +443,10 @@ def _unstack_frame(obj, level): | |
|
||
|
||
def get_compressed_ids(labels, sizes): | ||
# no overflow | ||
if com._long_prod(sizes) < 2 ** 63: | ||
group_index = get_group_index(labels, sizes) | ||
comp_index, obs_ids = _compress_group_index(group_index) | ||
else: | ||
n = len(labels[0]) | ||
mask = np.zeros(n, dtype=bool) | ||
for v in labels: | ||
mask |= v < 0 | ||
|
||
while com._long_prod(sizes) >= 2 ** 63: | ||
i = len(sizes) | ||
while com._long_prod(sizes[:i]) >= 2 ** 63: | ||
i -= 1 | ||
|
||
rem_index, rem_ids = get_compressed_ids(labels[:i], | ||
sizes[:i]) | ||
sizes = [len(rem_ids)] + sizes[i:] | ||
labels = [rem_index] + labels[i:] | ||
|
||
return get_compressed_ids(labels, sizes) | ||
from pandas.core.groupby import get_flat_ids | ||
|
||
return comp_index, obs_ids | ||
ids = get_flat_ids(labels, sizes, True) | ||
return _compress_group_index(ids, sort=True) | ||
|
||
|
||
def stack(frame, level=-1, dropna=True): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ | |
import nose | ||
import functools | ||
import itertools | ||
from itertools import product | ||
from itertools import product, permutations | ||
from distutils.version import LooseVersion | ||
|
||
from pandas.compat import( | ||
|
@@ -12334,6 +12334,53 @@ def test_unstack_non_unique_index_names(self): | |
with tm.assertRaises(ValueError): | ||
df.T.stack('c1') | ||
|
||
def test_unstack_nan_index(self): # GH7466 | ||
cast = lambda val: '{0:1}'.format('' if val != val else val) | ||
nan = np.nan | ||
|
||
def verify(df): | ||
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] | ||
rows, cols = df.notnull().values.nonzero() | ||
for i, j in zip(rows, cols): | ||
left = sorted(df.iloc[i, j].split('.')) | ||
right = mk_list(df.index[i]) + mk_list(df.columns[j]) | ||
right = sorted(list(map(cast, right))) | ||
self.assertEqual(left, right) | ||
|
||
df = DataFrame({'jim':['a', 'b', nan, 'd'], | ||
'joe':['w', 'x', 'y', 'z'], | ||
'jolie':['a.w', 'b.x', ' .y', 'd.z']}) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please give tests for multiple nans in a single level, and for a single nan in each of multiple levels (for which you probably just raise a ValueError, as it cannot be computed). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, I see you do handle this. BUT the problem is that the resulting index is full of nan levels. Hmm. This should be a warning, or maybe an option to unstack/pivot. You get a useful result, but then indexing will blow up. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
@jreback No! There will be no nan in the levels, and there is no nan level. See the last lines in the very first comment, or check […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is not a question of performance but of subsequent indexing. Having nan in the levels is currently restricted to a single nan for indexing. They make the indexes non-unique by definition, and when you try to look up the locations they will fail. You can only do positional-type indexing. So you leave the user with an odd structure that only partially works. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My point was that this handles nan without polluting the levels or impacting performance. Comments addressed, and all tests came back green. |
||
|
||
left = df.set_index(['jim', 'joe']).unstack()['jolie'] | ||
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T | ||
assert_frame_equal(left, right) | ||
|
||
for idx in permutations(df.columns[:2]): | ||
mi = df.set_index(list(idx)) | ||
for lev in range(2): | ||
udf = mi.unstack(level=lev) | ||
self.assertEqual(udf.notnull().values.sum(), len(df)) | ||
verify(udf['jolie']) | ||
|
||
df = DataFrame({'1st':['d'] * 3 + [nan] * 5 + ['a'] * 2 + | ||
['c'] * 3 + ['e'] * 2 + ['b'] * 5, | ||
'2nd':['y'] * 2 + ['w'] * 3 + [nan] * 3 + | ||
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2, | ||
'3rd':[67,39,53,72,57,80,31,18,11,30,59, | ||
50,62,59,76,52,14,53,60,51]}) | ||
|
||
df['4th'], df['5th'] = \ | ||
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ | ||
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) | ||
|
||
for idx in permutations(['1st', '2nd', '3rd']): | ||
mi = df.set_index(list(idx)) | ||
for lev in range(3): | ||
udf = mi.unstack(level=lev) | ||
self.assertEqual(udf.notnull().values.sum(), 2 * len(df)) | ||
for col in ['4th', '5th']: | ||
verify(udf[col]) | ||
|
||
def test_stack_datetime_column_multiIndex(self): | ||
# GH 8039 | ||
t = datetime(2014, 1, 1) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a Parameters/Returns block?