diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 7433adaa4b738..66b772d35f2e2 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -67,6 +67,7 @@ Bug Fixes - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`) +- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`7466`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4c221cc27fdce..888dca3914b53 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -6,7 +6,7 @@ from pandas.compat import( zip, builtins, range, long, lzip, - OrderedDict, callable + OrderedDict, callable, filter, map ) from pandas import compat @@ -3510,6 +3510,61 @@ def get_group_index(label_list, shape): np.putmask(group_index, mask, -1) return group_index + +def get_flat_ids(labels, shape, retain_lex_rank): + """ + Given a list of labels at each level, returns a flat array of int64 ids + corresponding to unique tuples across the labels. If `retain_lex_rank`, + ranks of returned ids preserve lexical ranks of labels. + + Parameters + ---------- + labels: sequence of arrays + Integers identifying levels at each location + shape: sequence of ints same length as labels + Number of unique levels at each location + retain_lex_rank: boolean + If the ranks of returned ids should match lexical ranks of labels + + Returns + ------- + An array of type int64 where two elements are equal if their corresponding + labels are equal at all locations. 
+ """ + def loop(labels, shape): + # how many levels can be done without overflow: + pred = lambda i: not _int64_overflow_possible(shape[:i]) + nlev = next(filter(pred, range(len(shape), 0, -1))) + + # compute flat ids for the first `nlev` levels + stride = np.prod(shape[1:nlev], dtype='i8') + out = stride * labels[0].astype('i8', subok=False, copy=False) + + for i in range(1, nlev): + stride //= shape[i] + out += labels[i] * stride + + if nlev == len(shape): # all levels done! + return out + + # compress what has been done so far in order to avoid overflow + # to retain lexical ranks, obs_ids should be sorted + comp_ids, obs_ids = _compress_group_index(out, sort=retain_lex_rank) + + labels = [comp_ids] + labels[nlev:] + shape = [len(obs_ids)] + shape[nlev:] + + return loop(labels, shape) + + def maybe_lift(lab, size): # promote nan values + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(com._ensure_int64, labels) + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + return loop(labels, shape) + + _INT64_MAX = np.iinfo(np.int64).max diff --git a/pandas/core/index.py b/pandas/core/index.py index d2a3093e686a7..97890299657cf 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -3226,44 +3226,13 @@ def _has_complex_internals(self): @cache_readonly def is_unique(self): from pandas.hashtable import Int64HashTable - - def _get_group_index(labels, shape): - from pandas.core.groupby import _int64_overflow_possible, \ - _compress_group_index - - # how many levels can be done without overflow - pred = lambda i: not _int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) - - # compute group indicies for the first `nlev` levels - group_index = labels[0].astype('i8', subok=False, copy=True) - stride = shape[0] - - for i in range(1, nlev): - group_index += labels[i] * stride - stride *= shape[i] - - if nlev == len(shape): - return group_index - - comp_ids, obs_ids = 
_compress_group_index(group_index, sort=False) - - labels = [comp_ids] + labels[nlev:] - shape = [len(obs_ids)] + shape[nlev:] - - return _get_group_index(labels, shape) - - def _maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + from pandas.core.groupby import get_flat_ids shape = map(len, self.levels) - labels = map(_ensure_int64, self.labels) - - labels, shape = map(list, zip(*map(_maybe_lift, labels, shape))) - group_index = _get_group_index(labels, shape) + ids = get_flat_ids(self.labels, shape, False) + table = Int64HashTable(min(1 << 20, len(ids))) - table = Int64HashTable(min(1 << 20, len(group_index))) - return len(table.unique(group_index)) == len(self) + return len(table.unique(ids)) == len(self) def get_value(self, series, key): # somewhat broken encapsulation diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 5ed823d690028..19208506fdc72 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -82,18 +82,10 @@ def __init__(self, values, index, level=-1, value_columns=None): self.level = self.index._get_level_number(level) - levels = index.levels - labels = index.labels - - def _make_index(lev, lab): - values = _make_index_array_level(lev.values, lab) - i = lev._simple_new(values, lev.name, - freq=getattr(lev, 'freq', None), - tz=getattr(lev, 'tz', None)) - return i - - self.new_index_levels = [_make_index(lev, lab) - for lev, lab in zip(levels, labels)] + # when index includes `nan`, need to lift levels/strides by 1 + self.lift = 1 if -1 in self.index.labels[self.level] else 0 + + self.new_index_levels = list(index.levels) self.new_index_names = list(index.names) self.removed_name = self.new_index_names.pop(self.level) @@ -134,10 +126,10 @@ def _make_selectors(self): ngroups = len(obs_ids) comp_index = _ensure_platform_int(comp_index) - stride = self.index.levshape[self.level] + stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride 
- selector = self.sorted_labels[-1] + stride * comp_index + selector = self.sorted_labels[-1] + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) @@ -166,20 +158,6 @@ def get_result(self): values = com.take_nd(values, inds, axis=1) columns = columns[inds] - # we might have a missing index - if len(index) != values.shape[0]: - mask = isnull(index) - if mask.any(): - l = np.arange(len(index)) - values, orig_values = (np.empty((len(index), values.shape[1])), - values) - values.fill(np.nan) - values_indexer = com._ensure_int64(l[~mask]) - for i, j in enumerate(values_indexer): - values[j] = orig_values[i] - else: - index = index.take(self.unique_groups) - # may need to coerce categoricals here if self.is_categorical is not None: values = [ Categorical.from_array(values[:,i], @@ -220,9 +198,16 @@ def get_new_values(self): def get_new_columns(self): if self.value_columns is None: - return self.removed_level + if self.lift == 0: + return self.removed_level + + lev = self.removed_level + vals = np.insert(lev.astype('object'), 0, + _get_na_value(lev.dtype.type)) + + return lev._shallow_copy(vals) - stride = len(self.removed_level) + stride = len(self.removed_level) + self.lift width = len(self.value_columns) propagator = np.repeat(np.arange(width), stride) if isinstance(self.value_columns, MultiIndex): @@ -231,59 +216,34 @@ def get_new_columns(self): new_labels = [lab.take(propagator) for lab in self.value_columns.labels] - new_labels.append(np.tile(np.arange(stride), width)) else: new_levels = [self.value_columns, self.removed_level] new_names = [self.value_columns.name, self.removed_name] + new_labels = [propagator] - new_labels = [] - - new_labels.append(propagator) - new_labels.append(np.tile(np.arange(stride), width)) - + new_labels.append(np.tile(np.arange(stride) - self.lift, width)) return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) def get_new_index(self): - 
result_labels = [] - for cur in self.sorted_labels[:-1]: - labels = cur.take(self.compressor) - labels = _make_index_array_level(labels, cur) - result_labels.append(labels) + result_labels = [lab.take(self.compressor) + for lab in self.sorted_labels[:-1]] # construct the new index if len(self.new_index_levels) == 1: - new_index = self.new_index_levels[0] - new_index.name = self.new_index_names[0] - else: - new_index = MultiIndex(levels=self.new_index_levels, - labels=result_labels, - names=self.new_index_names, - verify_integrity=False) - - return new_index + lev, lab = self.new_index_levels[0], result_labels[0] + if not (lab == -1).any(): + return lev.take(lab) + vals = np.insert(lev.astype('object'), len(lev), + _get_na_value(lev.dtype.type)).take(lab) -def _make_index_array_level(lev, lab): - """ create the combined index array, preserving nans, return an array """ - mask = lab == -1 - if not mask.any(): - return lev - - l = np.arange(len(lab)) - mask_labels = np.empty(len(mask[mask]), dtype=object) - mask_labels.fill(_get_na_value(lev.dtype.type)) - mask_indexer = com._ensure_int64(l[mask]) - - labels = lev - labels_indexer = com._ensure_int64(l[~mask]) - - new_labels = np.empty(tuple([len(lab)]), dtype=object) - new_labels[labels_indexer] = labels - new_labels[mask_indexer] = mask_labels - - return new_labels + return lev._shallow_copy(vals) + return MultiIndex(levels=self.new_index_levels, + labels=result_labels, + names=self.new_index_names, + verify_integrity=False) def _unstack_multiple(data, clocs): if len(clocs) == 0: @@ -483,29 +443,10 @@ def _unstack_frame(obj, level): def get_compressed_ids(labels, sizes): - # no overflow - if com._long_prod(sizes) < 2 ** 63: - group_index = get_group_index(labels, sizes) - comp_index, obs_ids = _compress_group_index(group_index) - else: - n = len(labels[0]) - mask = np.zeros(n, dtype=bool) - for v in labels: - mask |= v < 0 - - while com._long_prod(sizes) >= 2 ** 63: - i = len(sizes) - while com._long_prod(sizes[:i]) 
>= 2 ** 63: - i -= 1 - - rem_index, rem_ids = get_compressed_ids(labels[:i], - sizes[:i]) - sizes = [len(rem_ids)] + sizes[i:] - labels = [rem_index] + labels[i:] - - return get_compressed_ids(labels, sizes) + from pandas.core.groupby import get_flat_ids - return comp_index, obs_ids + ids = get_flat_ids(labels, sizes, True) + return _compress_group_index(ids, sort=True) def stack(frame, level=-1, dropna=True): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a19a32ea793ba..fcbfb21bd20e3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11,7 +11,7 @@ import nose import functools import itertools -from itertools import product +from itertools import product, permutations from distutils.version import LooseVersion from pandas.compat import( @@ -12334,6 +12334,53 @@ def test_unstack_non_unique_index_names(self): with tm.assertRaises(ValueError): df.T.stack('c1') + def test_unstack_nan_index(self): # GH7466 + cast = lambda val: '{0:1}'.format('' if val != val else val) + nan = np.nan + + def verify(df): + mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] + rows, cols = df.notnull().values.nonzero() + for i, j in zip(rows, cols): + left = sorted(df.iloc[i, j].split('.')) + right = mk_list(df.index[i]) + mk_list(df.columns[j]) + right = sorted(list(map(cast, right))) + self.assertEqual(left, right) + + df = DataFrame({'jim':['a', 'b', nan, 'd'], + 'joe':['w', 'x', 'y', 'z'], + 'jolie':['a.w', 'b.x', ' .y', 'd.z']}) + + left = df.set_index(['jim', 'joe']).unstack()['jolie'] + right = df.set_index(['joe', 'jim']).unstack()['jolie'].T + assert_frame_equal(left, right) + + for idx in permutations(df.columns[:2]): + mi = df.set_index(list(idx)) + for lev in range(2): + udf = mi.unstack(level=lev) + self.assertEqual(udf.notnull().values.sum(), len(df)) + verify(udf['jolie']) + + df = DataFrame({'1st':['d'] * 3 + [nan] * 5 + ['a'] * 2 + + ['c'] * 3 + ['e'] * 2 + ['b'] * 5, + '2nd':['y'] * 2 + ['w'] * 3 + [nan] * 3 + 
+ ['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2, + '3rd':[67,39,53,72,57,80,31,18,11,30,59, + 50,62,59,76,52,14,53,60,51]}) + + df['4th'], df['5th'] = \ + df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ + df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) + + for idx in permutations(['1st', '2nd', '3rd']): + mi = df.set_index(list(idx)) + for lev in range(3): + udf = mi.unstack(level=lev) + self.assertEqual(udf.notnull().values.sum(), 2 * len(df)) + for col in ['4th', '5th']: + verify(udf[col]) + def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 23350b203ee50..39d189b7de52b 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -173,13 +173,16 @@ def test_pivot_multi_functions(self): def test_pivot_index_with_nan(self): # GH 3588 nan = np.nan - df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]}) + df = DataFrame({'a':['R1', 'R2', nan, 'R4'], + 'b':['C1', 'C2', 'C3' , 'C4'], + 'c':[10, 15, 17, 20]}) result = df.pivot('a','b','c') - expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan], - [nan,nan,nan,nan],[nan,nan,15,20]], - index = Index(['R1','R2',nan,'R4'],name='a'), + expected = DataFrame([[nan,nan,17,nan],[10,nan,nan,nan], + [nan,15,nan,nan],[nan,nan,nan,20]], + index = Index([nan,'R1','R2','R4'],name='a'), columns = Index(['C1','C2','C3','C4'],name='b')) tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T) def test_pivot_with_tz(self): # GH 5878 @@ -268,12 +271,12 @@ def _check_output(res, col, index=['A', 'B'], columns=['C']): # issue number #8349: pivot_table with margins and dictionary aggfunc - df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, - {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, - 
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, - {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, - {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, - {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, + df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, + {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, + {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, + {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ]) df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False)