diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e92de770ac4bd..e950df9c633af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3718,7 +3718,7 @@ def pivot(self, index=None, columns=None, values=None): from pandas.core.reshape import pivot return pivot(self, index=index, columns=columns, values=values) - def stack(self, level=-1, dropna=True): + def stack(self, level=-1, dropna=True, sequentially=True): """ Pivot a level of the (possibly hierarchical) column labels, returning a DataFrame (or Series in the case of an object with a single level of @@ -3728,11 +3728,15 @@ def stack(self, level=-1, dropna=True): Parameters ---------- - level : int, string, or list of these, default last level - Level(s) to stack, can pass level name + level : int, string, list of these, or None; default -1 (last level) + Level(s) to stack, can pass level name(s). + None specifies all column levels, i.e. list(range(columns.nlevels)). dropna : boolean, default True Whether to drop rows in the resulting Frame/Series with no valid values + sequentially : boolean, default True + When level is a list (or None), whether the multiple column levels + should be stacked sequentially (if True) or simultaneously (if False). 
Examples ---------- @@ -3751,14 +3755,20 @@ def stack(self, level=-1, dropna=True): ------- stacked : DataFrame or Series """ - from pandas.core.reshape import stack, stack_multiple + from pandas.core.reshape import stack_levels_sequentially, stack_multi_levels_simultaneously - if isinstance(level, (tuple, list)): - return stack_multiple(self, level, dropna=dropna) + level_nums = self.columns._get_level_numbers(level, allow_mixed_names_and_numbers=False) + if level_nums == []: + if dropna: + return self.dropna(axis=0, how='all') + else: + return self + elif (not sequentially) and isinstance(self.columns, MultiIndex): + return stack_multi_levels_simultaneously(self, level_nums, dropna=dropna) else: - return stack(self, level, dropna=dropna) + return stack_levels_sequentially(self, level_nums, dropna=dropna) - def unstack(self, level=-1): + def unstack(self, level=-1, dropna=False, sequentially=False): """ Pivot a level of the (necessarily hierarchical) index labels, returning a DataFrame having a new level of column labels whose inner-most level @@ -3769,8 +3779,15 @@ def unstack(self, level=-1): Parameters ---------- - level : int, string, or list of these, default -1 (last level) - Level(s) of index to unstack, can pass level name + level : int, string, list of these, or None; default -1 (last level) + Level(s) of index to unstack, can pass level name(s). + None specifies all index levels, i.e. list(range(index.nlevels)). + dropna : boolean, default False + Whether to drop columns in the resulting Frame/Series with no valid + values + sequentially : boolean, default False + When level is a list (or None), whether the multiple index levels + should be unstacked sequentially (if True) or simultaneously (if False). 
See also -------- @@ -3812,7 +3829,44 @@ def unstack(self, level=-1): unstacked : DataFrame or Series """ from pandas.core.reshape import unstack - return unstack(self, level) + + level_nums = self.index._get_level_numbers(level, allow_mixed_names_and_numbers=False) + if level_nums == []: + if dropna: + return self.dropna(axis=1, how='all') + else: + return self + if sequentially and isinstance(level_nums, list) and (len(level_nums) > 1): + result = self + # Adjust level_nums to account for the fact that levels move "up" + # as a result of stacking of earlier levels. + adjusted_level_nums = [x - sum((y < x) for y in level_nums[:i]) + for i, x in enumerate(level_nums)] + for level_num in adjusted_level_nums: + result = unstack(result, level_num) + else: + result = unstack(self, level_nums) + + if isinstance(result, DataFrame): + # fix dtypes, if necessary + desired_dtypes = self.dtypes.values.repeat(len(result.columns) // len(self.columns)) + result_dtypes = result.dtypes.values + for i, c in enumerate(result.columns): + if result_dtypes[i] != desired_dtypes[i]: + if result_dtypes[i] == np.object: + # use default Series constructor to set type + result[c] = Series(result[c].values.tolist(), index=result.index) + else: + # try to convert type directly + result[c] = result[c].astype(desired_dtypes[i], raise_on_error=False) + # drop empty columns, if necessary + if dropna: + result = result.dropna(axis=1, how='all') + else: + if dropna: + result = result.dropna() + + return result #---------------------------------------------------------------------- # Time series-related diff --git a/pandas/core/index.py b/pandas/core/index.py index b4c690fe8973b..86afc922fb8db 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1033,7 +1033,7 @@ def _validate_index_level(self, level): verification must be done like in MultiIndex. 
""" - if isinstance(level, int): + if com.is_integer(level): if level < 0 and level != -1: raise IndexError("Too many levels: Index has only 1 level," " %d is not a valid level number" % (level,)) @@ -1045,10 +1045,44 @@ def _validate_index_level(self, level): raise KeyError('Level %s must be same as name (%s)' % (level, self.name)) - def _get_level_number(self, level): + def _get_level_number(self, level, ignore_names=False): + """ + Returns level number corresponding to level. + If level is a level name and ignore_names is False, + the level number corresponding to such level name is returned. + Otherwise level must be a number. + If level is a positive number, it is returned. + If level is a negative number, its sum with self.nlevels is returned. + """ + if ignore_names and (not com.is_integer(level)): + raise KeyError('Level %s not found' % str(level)) self._validate_index_level(level) return 0 + def _get_level_numbers(self, levels, allow_mixed_names_and_numbers=False): + """ + Returns level numbers corresponding to levels. + If levels is None, a list of all level numbers is returned. + If levels is a single number or level name, + then a single number is returned (using _get_level_number()). + If levels is a list of numbers or level names, + then a list of numbers is returned (each using _get_level_number()). + If allow_mixed_names_and_numbers is False, then levels must be + either all level numbers or all level names. 
+ """ + if levels is None: + return list(range(self.nlevels)) + elif isinstance(levels, (list, tuple, set)): + if (not allow_mixed_names_and_numbers) and (not all(lev in self.names for lev in levels)): + if all(isinstance(lev, int) for lev in levels): + return type(levels)(self._get_level_number(level, ignore_names=True) for level in levels) + else: + raise ValueError("level should contain all level names or all level numbers, " + "not a mixture of the two.") + return type(levels)(self._get_level_number(level) for level in levels) + else: + return self._get_level_number(levels) + @cache_readonly def inferred_type(self): """ return a string of the type inferred from the values """ @@ -4294,28 +4328,38 @@ def _from_elements(values, labels=None, levels=None, names=None, sortorder=None): return MultiIndex(levels, labels, names, sortorder=sortorder) - def _get_level_number(self, level): - try: + def _get_level_number(self, level, ignore_names=False): + """ + Returns level number corresponding to level. + If level is a level name and ignore_names is False, + the level number corresponding to such level name is returned. + Otherwise level must be a number. + If level is a positive number, it is returned. + If level is a negative number, its sum with self.nlevels is returned. 
+ """ + if not ignore_names: count = self.names.count(level) if count > 1: raise ValueError('The name %s occurs multiple times, use a ' 'level number' % level) - level = self.names.index(level) - except ValueError: - if not isinstance(level, int): - raise KeyError('Level %s not found' % str(level)) - elif level < 0: - level += self.nlevels - if level < 0: - orig_level = level - self.nlevels - raise IndexError( - 'Too many levels: Index has only %d levels, ' - '%d is not a valid level number' % (self.nlevels, orig_level) - ) - # Note: levels are zero-based - elif level >= self.nlevels: - raise IndexError('Too many levels: Index has only %d levels, ' - 'not %d' % (self.nlevels, level + 1)) + try: + return self.names.index(level) + except ValueError: + pass + if not com.is_integer(level): + raise KeyError('Level %s not found' % str(level)) + elif level < 0: + level += self.nlevels + if level < 0: + orig_level = level - self.nlevels + raise IndexError( + 'Too many levels: Index has only %d levels, ' + '%d is not a valid level number' % (self.nlevels, orig_level) + ) + # Note: levels are zero-based + elif level >= self.nlevels: + raise IndexError('Too many levels: Index has only %d levels, ' + 'not %d' % (self.nlevels, level + 1)) return level _tuples = None @@ -4891,7 +4935,7 @@ def _drop_from_level(self, labels, level): return self[mask] - def droplevel(self, level=0): + def droplevel(self, level=0, ignore_names=False): """ Return Index with requested level removed. If MultiIndex has only 2 levels, the result will be of Index type not MultiIndex. 
@@ -4899,6 +4943,8 @@ def droplevel(self, level=0): Parameters ---------- level : int/level name or list thereof + ignore_names : boolean, default False + If True, level must be an int or list thereof Notes ----- @@ -4916,7 +4962,7 @@ new_labels = list(self.labels) new_names = list(self.names) - levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] + levnums = sorted((self._get_level_number(lev, ignore_names) for lev in levels), reverse=True) for i in levnums: new_levels.pop(i) @@ -4929,6 +4975,9 @@ mask = new_labels[0] == -1 result = new_levels[0].take(new_labels[0]) if mask.any(): + if result.is_integer(): + # cannot store NaNs in an integer index, so promote to Float64Index + result = Float64Index(result.values, name=result.name) result = result.putmask(mask, np.nan) result.name = new_names[0] @@ -5539,7 +5588,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: - loc = level_index.get_loc(key) + loc = -1 if com.is_float(key) and np.isnan(key) else level_index.get_loc(key) if level > 0 or self.lexsort_depth == 0: return np.array(labels == loc,dtype=bool) else: @@ -6050,7 +6099,7 @@ def _trim_front(strings): def _sanitize_and_check(indexes): - kinds = list(set([type(index) for index in indexes])) + kinds = list(set(type(index) for index in indexes)) if list in kinds: if len(kinds) > 1: @@ -6071,11 +6120,11 @@ def _get_consensus_names(indexes): # find the non-none names, need to tupleify to make # the set hashable, then reverse on return - consensus_names = set([ + consensus_names = set( tuple(i.names) for i in indexes if all(n is not None for n in i.names) - ]) + ) if len(consensus_names) == 1: - return list(list(consensus_names)[0]) + return list(consensus_names.pop()) return [None] * indexes[0].nlevels diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index fecfe5cd82c6d..6940e36b23f4a 100644 --- a/pandas/core/reshape.py +++ 
b/pandas/core/reshape.py @@ -21,6 +21,8 @@ import pandas.algos as algos from pandas.core.index import MultiIndex, _get_na_value +from pandas.core.algorithms import factorize, unique +from pandas.tslib import NaTType class _Unstacker(object): @@ -61,8 +63,25 @@ class _Unstacker(object): unstacked : DataFrame """ - def __init__(self, values, index, level=-1, value_columns=None): - + def __init__(self, values, index, level_num, value_columns=None): + """ + Initializes _Unstacker object. + + Parameters + ---------- + values : ndarray + Values to use for populating new frame's values + index : ndarray + Labels to use to make new frame's index + level_num : int + Level to unstack, must be an integer in the range [0, len(index)) + value_columns : ndarray + Labels to use to make new frame's columns + + Notes + ----- + Obviously, values, index, and values_columns must have the same length + """ self.is_categorical = None if values.ndim == 1: if isinstance(values, Categorical): @@ -77,13 +96,7 @@ def __init__(self, values, index, level=-1, value_columns=None): self.index = index - if isinstance(self.index, MultiIndex): - if index._reference_duplicate_name(level): - msg = ("Ambiguous reference to {0}. 
The index " - "names are not unique.".format(level)) - raise ValueError(msg) - - self.level = self.index._get_level_number(level) + self.level = level_num # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.labels[self.level] else 0 @@ -239,6 +252,26 @@ def get_new_index(self): verify_integrity=False) +def _make_new_index(lev, lab): + from pandas.core.index import Index, _get_na_value + + nan = _get_na_value(lev.dtype.type) + vals = lev.values.astype('object') + vals = np.insert(vals, 0, nan) if lab is None else \ + np.insert(vals, len(vals), nan).take(lab) + + if com.is_datetime_or_timedelta_dtype(lev.dtype): + nan_indices = [0] if lab is None else (np.array(lab) == -1) + vals[nan_indices] = None + + try: + vals = vals.astype(lev.dtype, subok=False, copy=False) + except ValueError: + return Index(vals, **lev._get_attributes_dict()) + + return lev._shallow_copy(vals) + + def _unstack_multiple(data, clocs): from pandas.core.groupby import decons_obs_group_ids @@ -249,8 +282,6 @@ def _unstack_multiple(data, clocs): index = data.index - clocs = [index._get_level_number(i) for i in clocs] - rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] @@ -395,26 +426,30 @@ def _slow_pivot(index, columns, values): return DataFrame(tree) -def unstack(obj, level): - if isinstance(level, (tuple, list)): - return _unstack_multiple(obj, level) +def unstack(obj, level_num): + if isinstance(level_num, (tuple, list)): + if len(level_num) == 1: + level_num = level_num[0] + else: + return _unstack_multiple(obj, level_num) if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): - return _unstack_frame(obj, level) + return _unstack_frame(obj, level_num) else: - return obj.T.stack(dropna=False) + #return obj.T.stack(dropna=False) + return stack_single_level(obj.T, 0, dropna=False) else: - unstacker = _Unstacker(obj.values, obj.index, level=level) + unstacker = _Unstacker(obj.values, 
obj.index, level_num=level_num) return unstacker.get_result() -def _unstack_frame(obj, level): +def _unstack_frame(obj, level_num): from pandas.core.internals import BlockManager, make_block if obj._is_mixed_type: unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy - obj.index, level=level, + obj.index, level_num=level_num, value_columns=obj.columns) new_columns = unstacker.get_new_columns() new_index = unstacker.get_new_index() @@ -424,7 +459,7 @@ def _unstack_frame(obj, level): mask_blocks = [] for blk in obj._data.blocks: blk_items = obj._data.items[blk.mgr_locs.indexer] - bunstacker = _Unstacker(blk.values.T, obj.index, level=level, + bunstacker = _Unstacker(blk.values.T, obj.index, level_num=level_num, value_columns=blk_items) new_items = bunstacker.get_new_columns() new_placement = new_columns.get_indexer(new_items) @@ -440,7 +475,7 @@ def _unstack_frame(obj, level): mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) return result.ix[:, mask_frame.sum(0) > 0] else: - unstacker = _Unstacker(obj.values, obj.index, level=level, + unstacker = _Unstacker(obj.values, obj.index, level_num=level_num, value_columns=obj.columns) return unstacker.get_result() @@ -452,54 +487,45 @@ def get_compressed_ids(labels, sizes): return _compress_group_index(ids, sort=True) -def stack(frame, level=-1, dropna=True): +def stack_single_level(frame, level_num, dropna=True): """ - Convert DataFrame to Series with multi-level Index. Columns become the - second level of the resulting hierarchical index + Convert DataFrame to DataFrame or Series with multi-level Index. 
+ Columns become the second level of the resulting hierarchical index + + Parameters + ---------- + frame : DataFrame + DataFrame to be unstacked + level_num : int + Column level to unstack, must be an integer in the range [0, len(index)) + dropna : boolean, default True + Whether to drop rows in the resulting Frame/Series with no valid + values Returns ------- - stacked : Series + stacked : DataFrame or Series """ - def factorize(index): - if index.is_unique: - return index, np.arange(len(index)) - cat = Categorical(index, ordered=True) - return cat.categories, cat.codes - - N, K = frame.shape if isinstance(frame.columns, MultiIndex): - if frame.columns._reference_duplicate_name(level): - msg = ("Ambiguous reference to {0}. The column " - "names are not unique.".format(level)) - raise ValueError(msg) + return stack_multi_levels_simultaneously(frame, level_nums=[level_num], dropna=dropna) - # Will also convert negative level numbers and check if out of bounds. - level_num = frame.columns._get_level_number(level) - - if isinstance(frame.columns, MultiIndex): - return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) - elif isinstance(frame.index, MultiIndex): + # frame.columns is a simple Index (not a MultiIndex) + N, K = frame.shape + if isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_labels = [lab.repeat(K) for lab in frame.index.labels] - - clev, clab = factorize(frame.columns) - new_levels.append(clev) - new_labels.append(np.tile(clab, N).ravel()) - new_names = list(frame.index.names) - new_names.append(frame.columns.name) - new_index = MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) else: - levels, (ilab, clab) = \ - zip(*map(factorize, (frame.index, frame.columns))) - labels = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex(levels=levels, - labels=labels, - names=[frame.index.name, frame.columns.name], - verify_integrity=False) - + idx_labels, new_levels 
= factorize(frame.index) + new_levels = [new_levels] + new_labels = [idx_labels.repeat(K)] + new_names = [frame.index.name] + col_labels, col_levels = factorize(frame.columns) + new_levels.append(col_levels) + new_labels.append(np.tile(col_labels, N).ravel()) + new_names.append(frame.columns.name) + new_index = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) new_values = frame.values.ravel() if dropna: mask = notnull(new_values) @@ -508,46 +534,29 @@ def factorize(index): return Series(new_values, index=new_index) -def stack_multiple(frame, level, dropna=True): - # If all passed levels match up to column names, no - # ambiguity about what to do - if all(lev in frame.columns.names for lev in level): - result = frame - for lev in level: - result = stack(result, lev, dropna=dropna) - - # Otherwise, level numbers may change as each successive level is stacked - elif all(isinstance(lev, int) for lev in level): - # As each stack is done, the level numbers decrease, so we need - # to account for that when level is a sequence of ints - result = frame - # _get_level_number() checks level numbers are in range and converts - # negative numbers to positive - level = [frame.columns._get_level_number(lev) for lev in level] - - # Can't iterate directly through level as we might need to change - # values as we go - for index in range(len(level)): - lev = level[index] - result = stack(result, lev, dropna=dropna) - # Decrement all level numbers greater than current, as these - # have now shifted down by one - updated_level = [] - for other in level: - if other > lev: - updated_level.append(other - 1) - else: - updated_level.append(other) - level = updated_level +def stack_levels_sequentially(frame, level_nums, dropna=True): + """ + Stack multiple levels of frame.columns -- which may be a MultiIndex or a simple Index -- sequentially. 
+ """ + if isinstance(level_nums, int): + return stack_single_level(frame, level_nums, dropna=dropna) - else: - raise ValueError("level should contain all level names or all level numbers, " - "not a mixture of the two.") + result = frame + # Adjust level_nums to account for the fact that levels move "up" + # as a result of stacking of earlier levels. + adjusted_level_nums = [x - sum((y < x) for y in level_nums[:i]) + for i, x in enumerate(level_nums)] + for level_num in adjusted_level_nums: + result = stack_single_level(result, level_num, dropna=dropna) return result -def _stack_multi_columns(frame, level_num=-1, dropna=True): +def stack_multi_levels_simultaneously(frame, level_nums, dropna=True): + """ + Stack multiple levels of frame.columns -- which must be a MultiIndex -- simultaneously. + """ + def _convert_level_number(level_num, columns): """ Logic for converting the level number to something @@ -565,70 +574,39 @@ def _convert_level_number(level_num, columns): else: return columns.names[level_num] + if isinstance(level_nums, int): + level_nums = [level_nums] + this = frame.copy() # this makes life much simpler - if level_num != frame.columns.nlevels - 1: - # roll levels to put selected level at end - roll_columns = this.columns - for i in range(level_num, frame.columns.nlevels - 1): + # roll levels to put selected level(s) at end + roll_columns = this.columns + for j, level_num in enumerate(reversed(level_nums)): + for i in range(level_num, frame.columns.nlevels - (j + 1)): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) - this.columns = roll_columns + this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, - # which interferes with trying to sort based on the first - # level + # which interferes with trying to sort based on the first 
level level_to_sort = _convert_level_number(0, this.columns) this = this.sortlevel(level_to_sort, axis=1) - # tuple list excluding level for grouping columns - if len(frame.columns.levels) > 2: - tuples = list(zip(*[ - lev.take(lab) for lev, lab in - zip(this.columns.levels[:-1], this.columns.labels[:-1]) - ])) - unique_groups = [key for key, _ in itertools.groupby(tuples)] - new_names = this.columns.names[:-1] - new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) - else: - new_columns = unique_groups = this.columns.levels[0] - - # time to ravel the values - new_data = {} - level_vals = this.columns.levels[-1] - level_labels = sorted(set(this.columns.labels[-1])) - level_vals_used = level_vals[level_labels] + num_levels_to_stack = len(level_nums) + level_vals = this.columns.levels[-num_levels_to_stack:] + level_labels = sorted(set(zip(*this.columns.labels[-num_levels_to_stack:]))) + level_vals_used = MultiIndex.from_tuples([tuple(np.nan if lab == -1 else level_vals[i][lab] + for i, lab in enumerate(label)) + for label in level_labels], + names=this.columns.names[-num_levels_to_stack:]) levsize = len(level_labels) - drop_cols = [] - for key in unique_groups: - loc = this.columns.get_loc(key) - slice_len = loc.stop - loc.start - # can make more efficient? 
- - if slice_len == 0: - drop_cols.append(key) - continue - elif slice_len != levsize: - chunk = this.ix[:, this.columns[loc]] - chunk.columns = level_vals.take(chunk.columns.labels[-1]) - value_slice = chunk.reindex(columns=level_vals_used).values - else: - if frame._is_mixed_type: - value_slice = this.ix[:, this.columns[loc]].values - else: - value_slice = this.values[:, loc] - - new_data[key] = value_slice.ravel() - - if len(drop_cols) > 0: - new_columns = new_columns.difference(drop_cols) + # construct new_index N = len(this) - if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) @@ -637,15 +615,55 @@ def _convert_level_number(level_num, columns): new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? - - new_levels.append(frame.columns.levels[level_num]) - new_labels.append(np.tile(level_labels, N)) - new_names.append(frame.columns.names[level_num]) - + new_levels += level_vals + new_labels += [np.tile(labels, N) for labels in zip(*level_labels)] + new_names += level_vals_used.names new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) - result = DataFrame(new_data, index=new_index, columns=new_columns) + # if stacking all levels in columns, result will be a Series + if len(this.columns.levels) == num_levels_to_stack: + new_data = this.values.ravel() + if dropna: + mask = notnull(new_data) + new_data = new_data[mask] + new_index = new_index[mask] + return Series(new_data, index=new_index) + + # result will be a DataFrame + + # construct new_columns + new_columns = this.columns.droplevel(list(range(this.columns.nlevels - num_levels_to_stack, + this.columns.nlevels)), True).drop_duplicates() + + # construct new_data + new_data = {} + unique_group_levels = this.columns.nlevels - num_levels_to_stack + unique_label_groups = unique(zip(*this.columns.labels[:unique_group_levels])) + + for i, 
unique_label_group in enumerate(unique_label_groups): + loc = np.array([True] * len(this.columns)) + for level_num in range(unique_group_levels): + loc &= (this.columns.labels[level_num] == unique_label_group[level_num]) + slice_len = loc.sum() + if slice_len != levsize: + chunk = this.iloc[:, loc] + chunk.columns = MultiIndex.from_arrays([_make_new_index(vals, labels) for vals, labels + in zip(level_vals, chunk.columns.labels[-num_levels_to_stack:])], + names=chunk.columns.names[-num_levels_to_stack:]) + value_slice = chunk.reindex(columns=level_vals_used).values + else: + if frame._is_mixed_type: + value_slice = this.iloc[:, loc].values + else: + value_slice = this.values[:, loc] + + new_data[i] = value_slice.ravel() + + # construct DataFrame with dummy columns, since construction from a dict + # doesn't handle NaNs correctly + result = DataFrame(new_data, index=new_index, columns=list(range(len(new_columns)))) + result.columns = new_columns # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... 
diff --git a/pandas/core/series.py b/pandas/core/series.py index f4e3374626011..865ecce5d1fd8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1983,7 +1983,9 @@ def unstack(self, level=-1): unstacked : DataFrame """ from pandas.core.reshape import unstack - return unstack(self, level) + + level_nums = self.index._get_level_numbers(level, allow_mixed_names_and_numbers=False) + return unstack(self, level_nums) #---------------------------------------------------------------------- # function application diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 6667d389bd6c5..222646c6db832 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -234,9 +234,9 @@ def test_setitem_mulit_index(self): ['left', 'center', 'right'] cols = MultiIndex.from_product(it) - index = pd.date_range('20141006',periods=20) + index = date_range('20141006',periods=20) vals = np.random.randint(1, 1000, (len(index), len(cols))) - df = pd.DataFrame(vals, columns=cols, index=index) + df = DataFrame(vals, columns=cols, index=index) i, j = df.index.values.copy(), it[-1][:] @@ -1996,7 +1996,7 @@ def verify(df, level, idx, indexer): right = df.iloc[indexer].set_index(icol) assert_frame_equal(left, right) - df = pd.DataFrame({'jim':list('B' * 4 + 'A' * 2 + 'C' * 3), + df = DataFrame({'jim':list('B' * 4 + 'A' * 2 + 'C' * 3), 'joe':list('abcdeabcd')[::-1], 'jolie':[10, 20, 30] * 3, 'joline': np.random.randint(0, 1000, 9)}) @@ -2045,7 +2045,7 @@ def verify(df, level, idx, indexer): verify(df, 'joe', ['3rd', '1st'], i) def test_getitem_ix_float_duplicates(self): - df = pd.DataFrame(np.random.randn(3, 3), + df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list('abc')) expect = df.iloc[1:] tm.assert_frame_equal(df.loc[0.2], expect) @@ -2062,7 +2062,7 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[1:, 0] tm.assert_series_equal(df.loc[0.2, 'a'], expect) - df = pd.DataFrame(np.random.randn(4, 3), + df = 
DataFrame(np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list('abc')) expect = df.iloc[1:-1] tm.assert_frame_equal(df.loc[0.2], expect) @@ -2081,14 +2081,14 @@ def test_getitem_ix_float_duplicates(self): def test_setitem_with_sparse_value(self): # GH8131 - df = pd.DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) - sp_series = pd.Series([0, 0, 1]).to_sparse(fill_value=0) + df = DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + sp_series = Series([0, 0, 1]).to_sparse(fill_value=0) df['new_column'] = sp_series tm.assert_series_equal(df['new_column'], sp_series, check_names=False) def test_setitem_with_unaligned_sparse_value(self): - df = pd.DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) - sp_series = (pd.Series([0, 0, 1], index=[2, 1, 0]) + df = DataFrame({'c_1':['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + sp_series = (Series([0, 0, 1], index=[2, 1, 0]) .to_sparse(fill_value=0)) df['new_column'] = sp_series exp = pd.Series([1, 0, 0], name='new_column') @@ -2488,7 +2488,7 @@ def test_set_index_cast_datetimeindex(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 - i = pd.DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific') + i = DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific') df = DataFrame(np.random.randn(2,1),columns=['A']) expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), @@ -2533,10 +2533,10 @@ def test_set_index_cast_datetimeindex(self): # GH 3950 # reset_index with single level for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: - idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx') - df = pd.DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) + idx = date_range('1/1/2011', periods=5, freq='D', tz=tz, name='idx') + df = DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) - expected = 
pd.DataFrame({'idx': [datetime(2011, 1, 1), datetime(2011, 1, 2), + expected = DataFrame({'idx': [datetime(2011, 1, 1), datetime(2011, 1, 2), datetime(2011, 1, 3), datetime(2011, 1, 4), datetime(2011, 1, 5)], 'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, @@ -2619,7 +2619,7 @@ def test_constructor_dtype_copy(self): 'col2': [2.], 'col3': [3.]}) - new_df = pd.DataFrame(orig_df, dtype=float, copy=True) + new_df = DataFrame(orig_df, dtype=float, copy=True) new_df['col1'] = 200. self.assertEqual(orig_df['col1'][0], 1.) @@ -3883,9 +3883,9 @@ def check(result, expected=None): # check column dups with index equal and not equal to df's index df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'A']) - for index in [df.index, pd.Index(list('edcba'))]: + for index in [df.index, Index(list('edcba'))]: this_df = df.copy() - expected_ser = pd.Series(index.values, index=this_df.index) + expected_ser = Series(index.values, index=this_df.index) expected_df = DataFrame.from_items([('A', expected_ser), ('B', this_df['B']), ('A', expected_ser)]) @@ -4397,7 +4397,7 @@ def test_constructor_for_list_with_dtypes(self): assert_series_equal(result, expected) def test_not_hashable(self): - df = pd.DataFrame([1]) + df = DataFrame([1]) self.assertRaises(TypeError, hash, df) self.assertRaises(TypeError, hash, self.empty) @@ -7521,7 +7521,7 @@ def test_info_memory_usage(self): # excluded column with object dtype, so estimate is accurate self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1])) - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index = DataFrame({'a': [1]}, index=['foo']) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1])) @@ -7545,11 +7545,11 @@ def test_info_memory_usage(self): # test for validity DataFrame(1,index=['a'],columns=['A']).memory_usage(index=True) DataFrame(1,index=['a'],columns=['A']).index.nbytes 
- DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True) - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes - DataFrame(1,index=pd.MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes + DataFrame(1,index=MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes + DataFrame(1,index=MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes + DataFrame(1,index=MultiIndex.from_product([['a'],range(1000)]),columns=['A']).memory_usage(index=True) + DataFrame(1,index=MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.nbytes + DataFrame(1,index=MultiIndex.from_product([['a'],range(1000)]),columns=['A']).index.values.nbytes def test_dtypes(self): self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 @@ -8706,14 +8706,14 @@ def test_drop(self): assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) - nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) + nu_df = nu_df.set_index(Index(['X', 'Y', 'X'])) nu_df.columns = list('abc') assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.ix[["Y"], :]) assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.ix[[], :]) # inplace cache issue # GH 5628 - df = pd.DataFrame(np.random.randn(10,3), columns=list('abc')) + df = DataFrame(np.random.randn(10,3), columns=list('abc')) expected = df[~(df.b>0)] df.drop(labels=df[df.b>0].index, inplace=True) assert_frame_equal(df,expected) @@ -9404,7 +9404,7 @@ def test_regex_replace_dict_nested(self): assert_frame_equal(res4, expec) def test_regex_replace_dict_nested_gh4115(self): - df = pd.DataFrame({'Type':['Q','T','Q','Q','T'], 'tmp':2}) + df = 
DataFrame({'Type':['Q','T','Q','Q','T'], 'tmp':2}) expected = DataFrame({'Type': [0,1,0,0,1], 'tmp': 2}) assert_frame_equal(df.replace({'Type': {'Q':0,'T':1}}), expected) @@ -9845,14 +9845,14 @@ def test_replace_str_to_str_chain(self): df.replace({'a': dict(zip(astr, bstr))}) def test_replace_swapping_bug(self): - df = pd.DataFrame({'a': [True, False, True]}) + df = DataFrame({'a': [True, False, True]}) res = df.replace({'a': {True: 'Y', False: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + expect = DataFrame({'a': ['Y', 'N', 'Y']}) tm.assert_frame_equal(res, expect) - df = pd.DataFrame({'a': [0, 1, 0]}) + df = DataFrame({'a': [0, 1, 0]}) res = df.replace({'a': {0: 'Y', 1: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + expect = DataFrame({'a': ['Y', 'N', 'Y']}) tm.assert_frame_equal(res, expect) def test_replace_period(self): @@ -9865,7 +9865,7 @@ def test_replace_period(self): 'out_augmented_MAY_2011.json': pd.Period(year=2011, month=5, freq='M'), 'out_augmented_SEP_2013.json': pd.Period(year=2013, month=9, freq='M')}} - df = pd.DataFrame(['out_augmented_AUG_2012.json', + df = DataFrame(['out_augmented_AUG_2012.json', 'out_augmented_SEP_2013.json', 'out_augmented_SUBSIDY_WEEK.json', 'out_augmented_MAY_2012.json', @@ -9888,7 +9888,7 @@ def test_replace_datetime(self): 'out_augmented_MAY_2011.json': pd.Timestamp('2011-05'), 'out_augmented_SEP_2013.json': pd.Timestamp('2013-09')}} - df = pd.DataFrame(['out_augmented_AUG_2012.json', + df = DataFrame(['out_augmented_AUG_2012.json', 'out_augmented_SEP_2013.json', 'out_augmented_SUBSIDY_WEEK.json', 'out_augmented_MAY_2012.json', @@ -11562,7 +11562,7 @@ def test_apply_bug(self): # GH 6125 import datetime - positions = pd.DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20], + positions = DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20], [1, 'DEF0', 20], [2, 'ABC1', 50], [2, 'YUM1', 20], [2, 'DEF1', 20]], columns=['a', 'market', 'position']) @@ -13055,7 +13055,7 @@ def wrapper(x): self.assertTrue(np.isnan(r1).all()) 
def test_mode(self): - df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], + df = DataFrame({"A": [12, 12, 11, 12, 19, 11], "B": [10, 10, 10, np.nan, 3, 4], "C": [8, 8, 8, 9, 9, 9], "D": np.arange(6,dtype='int64'), @@ -13067,9 +13067,9 @@ def test_mode(self): expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame() assert_frame_equal(df[["E"]].mode(), expected) assert_frame_equal(df[["A", "B"]].mode(), - pd.DataFrame({"A": [12], "B": [10.]})) + DataFrame({"A": [12], "B": [10.]})) assert_frame_equal(df.mode(), - pd.DataFrame({"A": [12, np.nan, np.nan], + DataFrame({"A": [12, np.nan, np.nan], "B": [10, np.nan, np.nan], "C": [8, 9, np.nan], "D": [np.nan, np.nan, np.nan], @@ -13080,7 +13080,7 @@ def test_mode(self): com.pprint_thing(df["C"]) com.pprint_thing(df["C"].mode()) a, b = (df[["A", "B", "C"]].mode(), - pd.DataFrame({"A": [12, np.nan], + DataFrame({"A": [12, np.nan], "B": [10, np.nan], "C": [8, 9]})) com.pprint_thing(a) @@ -13090,18 +13090,18 @@ def test_mode(self): df = pd.DataFrame({"A": np.arange(6,dtype='int64'), "B": pd.date_range('2011', periods=6), "C": list('abcdef')}) - exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype), - "B": pd.Series([], dtype=df["B"].dtype), - "C": pd.Series([], dtype=df["C"].dtype)}) + exp = DataFrame({"A": Series([], dtype=df["A"].dtype), + "B": Series([], dtype=df["B"].dtype), + "C": Series([], dtype=df["C"].dtype)}) assert_frame_equal(df.mode(), exp) # and also when not empty df.loc[1, "A"] = 0 df.loc[4, "B"] = df.loc[3, "B"] df.loc[5, "C"] = 'e' - exp = pd.DataFrame({"A": pd.Series([0], dtype=df["A"].dtype), - "B": pd.Series([df.loc[3, "B"]], dtype=df["B"].dtype), - "C": pd.Series(['e'], dtype=df["C"].dtype)}) + exp = DataFrame({"A": Series([0], dtype=df["A"].dtype), + "B": Series([df.loc[3, "B"]], dtype=df["B"].dtype), + "C": Series(['e'], dtype=df["C"].dtype)}) assert_frame_equal(df.mode(), exp) @@ -13668,6 +13668,13 @@ def test_stack_ints(self): list(itertools.product(range(3), repeat=3)) ) ) + + for 
level in (2, 1, 0, [0, 1], [0, 2], [1, 2], [1, 0], [2, 0], [2, 1]): + np.testing.assert_equal(df.stack(level=level).size, + df.size) + np.testing.assert_almost_equal(df.stack(level=level).sum().sum(), + df.sum().sum()) + assert_frame_equal( df.stack(level=[1, 2]), df.stack(level=1).stack(level=1) @@ -13811,7 +13818,6 @@ def test_unstack_to_series(self): assert_frame_equal(old_data, data) def test_unstack_dtypes(self): - # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], @@ -13849,7 +13855,7 @@ def test_unstack_dtypes(self): (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')): df = DataFrame({'A': ['a']*5, 'C':c, 'D':d, - 'B':pd.date_range('2012-01-01', periods=5)}) + 'B':date_range('2012-01-01', periods=5)}) right = df.iloc[:3].copy(deep=True) @@ -13873,7 +13879,8 @@ def test_unstack_non_unique_index_names(self): with tm.assertRaises(ValueError): df.T.stack('c1') - def test_unstack_nan_index(self): # GH7466 + def test_unstack_nan_index(self): + # GH7466 cast = lambda val: '{0:1}'.format('' if val != val else val) nan = np.nan @@ -13881,7 +13888,7 @@ def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notnull().values.nonzero() for i, j in zip(rows, cols): - left = sorted(df.iloc[i, j].split('.')) + left = sorted(df.iloc[i, :].iloc[j].split('.')) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) self.assertEqual(left, right) @@ -13921,7 +13928,7 @@ def verify(df): verify(udf[col]) # GH7403 - df = pd.DataFrame({'A': list('aaaabbbb'),'B':range(8), 'C':range(8)}) + df = DataFrame({'A': list('aaaabbbb'),'B':range(8), 'C':range(8)}) df.iloc[3, 1] = np.NaN left = df.set_index(['A', 'B']).unstack(0) @@ -13949,7 +13956,7 @@ def verify(df): right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - df = pd.DataFrame({'A': list('aaaabbbb'),'B':list(range(4))*2, + df = DataFrame({'A': list('aaaabbbb'),'B':list(range(4))*2, 'C':range(8)}) df.iloc[3,1] = np.NaN left = 
df.set_index(['A', 'B']).unstack(0) @@ -13963,7 +13970,7 @@ def verify(df): assert_frame_equal(left, right) # GH7401 - df = pd.DataFrame({'A': list('aaaaabbbbb'), 'C':np.arange(10), + df = DataFrame({'A': list('aaaaabbbbb'), 'C':np.arange(10), 'B':date_range('2012-01-01', periods=5).tolist()*2 }) df.iloc[3,1] = np.NaN @@ -13976,6 +13983,8 @@ def verify(df): names=[None, 'B']) right = DataFrame(vals, columns=cols, index=idx) + for i in [1, 2, 3, 5]: + right.iloc[:, i] = right.iloc[:, i].astype(df.dtypes['C']) assert_frame_equal(left, right) # GH4862 @@ -14086,6 +14095,403 @@ def _test_stack_with_multiindex(multiindex): dtype=df.dtypes[0]) assert_frame_equal(result, expected) + def test_stack_multi(self): + # GH 8851 + + df_nonan = DataFrame(np.arange(2*6).reshape(2,6), + columns=MultiIndex.from_tuples([('A','a','X'), ('A','a','Y'), + ('A','b','X'), ('B','a','Z'), + ('B','b','Y'), ('B','b','X')], + names=['ABC','abc','XYZ']), + dtype=np.float64) + # ABC A B + # abc a b a b + # XYZ X Y X Z Y X + # 0 0 1 2 3 4 5 + # 1 6 7 8 9 10 11 + + df_nan = df_nonan.copy() + df_nan.iloc[0, 1] = nan + df_nan.iloc[0, 4] = nan + # ABC A B + # abc a b a b + # XYZ X Y X Z Y X + # 0 0 NaN 2 3 NaN 5 + # 1 6 7 8 9 10 11 + + # check consistency of the following calls for any single level n + # stack(level=n, sequentially=True) + # stack(level=n, sequentially=False) + # stack(level=[n], sequentially=True) + # stack(level=[n], sequentially=False) + for df in (df_nonan, df_nan): + for lev in (-1, 0, 1, 2, 'ABC', 'abc', 'XYZ'): + for dropna in (True, False): + expected = None + for level in (lev, [lev]): + for sequentially in (True, False): + result = df.stack(level=level, dropna=dropna, sequentially=sequentially) + if expected is None: + expected = result + else: + assert_frame_equal(result, expected) + + # check that result of stacking a single level is as expected + result = df_nonan.stack(level=0, dropna=False) + expected = DataFrame([[0, 1, None, 2, None], + [None, None, 3, 5, 4], + [6, 7, 
None, 8, None], + [None, None, 9, 11, 10]], + index=MultiIndex.from_tuples([(0,'A'), (0,'B'), + (1,'A'), (1,'B')], + names=[None, 'ABC']), + columns=MultiIndex.from_tuples([('a','X'), ('a','Y'), ('a','Z'), + ('b','X'), ('b','Y')], + names=['abc', 'XYZ']), + dtype=np.float64) + # abc a b + # XYZ X Y Z X Y + # ABC + # 0 A 0 1 NaN 2 NaN + # B NaN NaN 3 5 4 + # 1 A 6 7 NaN 8 NaN + # B NaN NaN 9 11 10 + assert_frame_equal(result, expected) + + # when dropna=False, missing values should not affect shape of result + result = df_nan.stack(level=0, dropna=False) + expected = expected.replace(1, nan).replace(4, nan) + assert_frame_equal(result, expected) + + # dropna=True has the effect of dropping all empty rows in the result + result = df_nan.stack(level=0, dropna=True) + expected.dropna(axis=0, how='all', inplace=True) + assert_frame_equal(result, expected) + + # check that result of stacking two levels simultaneously is as expected + result = df_nonan.stack(level=[0, 2], dropna=False, sequentially=False) + expected = DataFrame([[0, 2], + [1, None], + [None, 5], + [None, 4], + [3, None], + [6, 8], + [7, None], + [None, 11], + [None, 10], + [9, None]], + index=MultiIndex.from_tuples([(0,'A','X'), (0,'A','Y'), + (0,'B','X'), (0,'B','Y'), (0,'B','Z'), + (1,'A','X'), (1,'A','Y'), + (1,'B','X'), (1,'B','Y'), (1,'B','Z')], + names=[None, 'ABC', 'XYZ']), + columns=Index(['a', 'b'], name='abc'), + dtype=np.float64) + # abc a b + # ABC XYZ + # 0 A X 0 2 + # Y 1 NaN + # B X NaN 5 + # Y NaN 4 + # Z 3 NaN + # 1 A X 6 8 + # Y 7 NaN + # B X NaN 11 + # Y NaN 10 + # Z 9 NaN + assert_frame_equal(result, expected) + + # when sequentially=False and the DataFrame has no missing values, the value of dropna shouldn't matter + result = df_nonan.stack(level=[0, 2], dropna=True, sequentially=False) + assert_frame_equal(result, expected) + + # when dropna=True, the value of sequentially shouldn't matter + result = df_nonan.stack(level=[0, 2], dropna=True, sequentially=True) + 
assert_frame_equal(result, expected) + + # when dropna=False and sequentially=False, missing values don't affect the shape of the result + result = df_nan.stack(level=[0, 2], dropna=False, sequentially=False) + expected = expected.replace(1, nan).replace(4, nan) + assert_frame_equal(result, expected) + + # dropna=True has the effect of dropping all empty rows in the result + result = df_nan.stack(level=[0, 2], dropna=True, sequentially=False) + expected.dropna(axis=0, how='all', inplace=True) + assert_frame_equal(result, expected) + + # when dropna=True, the value of sequentially shouldn't matter + result = df_nan.stack(level=[0, 2], dropna=True, sequentially=True) + assert_frame_equal(result, expected) + + def test_stack_and_unstack_all_product_levels(self): + # GH 8851 + + for index in (Index([0, 1]), + MultiIndex.from_tuples([(0, 100), (1, 101)], + names=[None, 'Hundred'])): + pass + + df = DataFrame(np.arange(2 * 3).reshape((2, 3)), + columns=Index(['x', 'y', 'z'], name='Lower'), + dtype=np.float64) + # Lower x y z + # 0 0 1 2 + # 1 3 4 5 + + # stacking with any parameters should produce the following: + expected = Series(np.arange(2 * 3), + index=MultiIndex.from_product([[0, 1], ['x', 'y', 'z']], + names=[None, 'Lower']), + dtype=np.float64) + # Lower + # 0 x 0 + # y 1 + # z 2 + # 1 x 3 + # y 4 + # z 5 + for level in (-1, 0, [0], None): + for dropna in (True, False): + for sequentially in (True, False): + result = df.stack(level=level, dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + result = df.T.unstack(level=level, dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + + df = DataFrame(np.arange(2 * 4).reshape((2, 4)), + columns=MultiIndex.from_product([['A', 'B'], ['x', 'y']], + names=['Upper', 'Lower']), + dtype=np.float64) + # Upper A B + # Lower x y x y + # 0 0 1 2 3 + # 1 4 5 6 7 + + # stacking all column levels in order should produce the following: + expected = Series(np.arange(2 * 4), + 
index=MultiIndex.from_product([[0, 1], ['A', 'B'], ['x', 'y']], + names=[None, 'Upper', 'Lower']), + dtype=np.float64) + # Upper Lower + # 0 A x 0 + # y 1 + # B x 2 + # y 3 + # 1 A x 4 + # y 5 + # B x 6 + # y 7 + # dtype: float64 + for level in ([0, 1], None): + for dropna in (True, False): + for sequentially in (True, False): + result = df.stack(level=level, dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + result = df.T.unstack(level=level, dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + + # stacking all column levels in reverse order should produce the following: + expected = Series([0, 2, 1, 3, 4, 6, 5, 7], + index=MultiIndex.from_product([[0, 1], ['x', 'y'], ['A', 'B']], + names=[None, 'Lower', 'Upper']), + dtype=np.float64) + # Lower Upper + # 0 x A 0 + # B 2 + # y A 1 + # B 3 + # 1 x A 4 + # B 6 + # y A 5 + # B 7 + # dtype: float64 + for dropna in (True, False): + for sequentially in (True, False): + result = df.stack(level=[1, 0], dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + if sequentially: + # DataFrame.unstack() does not properly sort list levels; see GH 9514 + result = df.T.unstack(level=[1, 0], dropna=dropna, sequentially=sequentially) + assert_series_equal(result, expected) + + def test_stack_all_levels_multiindex_columns(self): + # GH 8851 + + df = DataFrame(np.arange(2 * 3).reshape((2, 3)), + columns=MultiIndex.from_tuples([('A','x'), ('A','y'), ('B','z')], + names=['Upper', 'Lower']), + dtype=np.float64) + # Upper A B + # Lower x y z + # 0 0 1 2 + # 1 3 4 5 + + # stacking all column levels with sequentially=False should produce the following: + expected = Series(np.arange(2 * 3), + index=MultiIndex.from_tuples([(0,'A','x'), (0,'A','y'), (0,'B','z'), + (1,'A','x'), (1,'A','y'), (1,'B','z')], + names=[None, 'Upper', 'Lower']), + dtype=np.float64) + # Upper Lower + # 0 A x 0 + # y 1 + # B z 2 + # 1 A x 3 + # y 4 + # B z 5 + + # switching order 
of levels should correspond to swapping levels of result + expected_swapped = expected.copy() + expected_swapped.index = expected.index.swaplevel(1, 2) + + for dropna in (True, False): + for level in ([0, 1], [0, -1], None): + result = df.stack(level=level, dropna=dropna, sequentially=False) + assert_series_equal(result, expected) + + for level in ([1, 0], [-1, 0]): + result = df.stack(level=level, dropna=dropna, sequentially=False) + assert_series_equal(result, expected_swapped) + + # since df has no missing values, should get same result with dropna=True and sequentially=True + result = df.stack(level=[0, 1], dropna=True, sequentially=True) + assert_series_equal(result, expected) + + # stacking all column levels with dropna=False and sequentially=True + expected = Series([0, 1, None, None, None, 2, + 3, 4, None, None, None, 5], + index=MultiIndex.from_tuples([(0,'A','x'), (0,'A','y'), (0,'A','z'), + (0,'B','x'), (0,'B','y'), (0,'B','z'), + (1,'A','x'), (1,'A','y'), (1,'A','z'), + (1,'B','x'), (1,'B','y'), (1,'B','z')], + names=[None, 'Upper', 'Lower']), + dtype=np.float64) + # Upper Lower + # 0 A x 0 + # y 1 + # z NaN + # B x NaN + # y NaN + # z 2 + # 1 A x 3 + # y 4 + # z NaN + # B x NaN + # y NaN + # z 5 + + for level in ([0, 1], [0, -1], None): + result = df.stack(level=level, dropna=False, sequentially=True) + assert_series_equal(result, expected) + + # check that this is indeed the result of stacking levels sequentially + result = df.stack(level=0, dropna=False).stack(level=0, dropna=False) + assert_series_equal(result, expected) + + def test_stack_nan_index(self): + # GH 9406 + df = DataFrame({'A': list('aaaabbbb'),'B':range(8), 'C':range(8)}) + df.iloc[3, 1] = np.NaN + dfs = df.set_index(['A', 'B']).T + + result = dfs.stack(0) + data0 = [[3, 0, 1, 2, nan, nan, nan, nan], + [nan, nan, nan, nan, 4, 5, 6, 7]] + cols = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B') + idx = MultiIndex(levels=[['C'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=[None, 'A']) + 
expected = DataFrame(data0, index=idx, columns=cols) + assert_frame_equal(result, expected) + + result = dfs.stack([0, 1], dropna=False, sequentially=True) + data = [x for y in data0 for x in y] + idx = MultiIndex(levels=[['C'], ['a', 'b'], [0., 1., 2., 4., 5., 6., 7.]], + labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], + [-1, 0, 1, 2, 3, 4, 5, 6, -1, 0, 1, 2, 3, 4, 5, 6]], + names=[None, 'A', 'B']) + expected = Series(data, index=idx) + assert_series_equal(result, expected) + + result = dfs.stack([0, 1], dropna=False, sequentially=False) + data = [3, 0, 1, 2, 4, 5, 6, 7] + idx = MultiIndex(levels=[['C'], ['a', 'b'], [0, 1, 2, 4, 5, 6, 7]], + labels=[[0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 1, 1, 1], + [-1, 0, 1, 2, 3, 4, 5, 6]], + names=[None, 'A', 'B']) + expected = Series(data, index=idx, dtype=dfs.dtypes[0]) + assert_series_equal(result, expected) + + result = dfs.stack(1, dropna=False) + data1 = [list(tuple) for tuple in zip(*data0)] # transpose + cols = Index(['a', 'b'], name='A') + idx = MultiIndex(levels=[['C'], [0, 1, 2, 4, 5, 6, 7]], + labels=[[0, 0, 0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4, 5, 6]], + names=[None, 'B']) + expected = DataFrame(data1, index=idx, columns=cols) + assert_frame_equal(result, expected) + + result = dfs.stack([1, 0], dropna=False, sequentially=True) + data = [x for y in data1 for x in y] + idx = MultiIndex(levels=[['C'], [0, 1, 2, 4, 5, 6, 7], ['a', 'b']], + labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [-1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]], + names=[None, 'B', 'A']) + expected = Series(data, index=idx) + assert_series_equal(result, expected) + + result = dfs.stack([1, 0], dropna=False, sequentially=False) + idx = MultiIndex(levels=[['C'], [0, 1, 2, 4, 5, 6, 7], ['a', 'b']], + labels=[[0, 0, 0, 0, 0, 0, 0, 0], + [-1, 0, 1, 2, 3, 4, 5, 6], + [0, 0, 0, 0, 1, 1, 1, 1]], + names=[None, 'B', 
'A']) + data = [3, 0, 1, 2, 4, 5, 6, 7] + expected = Series(data, index=idx, dtype=dfs.dtypes[0]) + assert_series_equal(result, expected) + + df_nan = DataFrame(np.arange(4).reshape(2, 2), + columns=MultiIndex.from_tuples([('A', np.nan), ('B', 'b')], + names=['Upper', 'Lower']), + index=Index([0, 1], name='Num'), + dtype=np.float64) + df_nonan = DataFrame(np.arange(4).reshape(2, 2), + columns=MultiIndex.from_tuples([('A', 'a'), ('B', 'b')], + names=['Upper', 'Lower']), + index=Index([0, 1], name='Num'), + dtype=np.float64) + for level in (0, 1, None, [1, 0]): + for dropna in (True, False): + for sequentially in (True, False): + result_nan = df_nan.stack(level, dropna=dropna, sequentially=sequentially) + result_nonan = df_nonan.stack(level, dropna=dropna, sequentially=sequentially) + assert_almost_equal(result_nan.values, result_nonan.values) + if level == 1: + tm.assert_index_equal(result_nan.columns, result_nonan.columns) + elif level == 0: + tm.assert_index_equal(result_nan.index, result_nonan.index) + + df = DataFrame([[11, 22], [33, 44]], + columns=MultiIndex.from_tuples([(1, 'a'), (None, 'b')], + names=['ints', 'letters'])) + + result = df.stack(0) + expected = DataFrame([[None, 22], [11, None], [None, 44], [33, None]], + columns=Index(['a', 'b'], name='letters'), + index=MultiIndex.from_product([[0, 1], [None, 1]], + names=[None, 'ints'])) + tm.assert_frame_equal(result, expected) + + result = df.stack(1) + expected = DataFrame([[None, 11], [22, None], [None, 33], [44, None]], + columns=Index([nan, 1], name='ints'), + index=MultiIndex.from_product([[0, 1], ['a', 'b']], + names=[None, 'letters'])) + tm.assert_frame_equal(result, expected) + def test_repr_with_mi_nat(self): df = DataFrame({'X': [1, 2]}, index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']]) @@ -14224,12 +14630,12 @@ def test_reset_index_multiindex_col(self): def test_reset_index_with_datetimeindex_cols(self): # GH5818 # - df = pd.DataFrame([[1, 2], [3, 4]], - columns=pd.date_range('1/1/2013', 
'1/2/2013'), + df = DataFrame([[1, 2], [3, 4]], + columns=date_range('1/1/2013', '1/2/2013'), index=['A', 'B']) result = df.reset_index() - expected = pd.DataFrame([['A', 1, 2], ['B', 3, 4]], + expected = DataFrame([['A', 1, 2], ['B', 3, 4]], columns=['index', datetime(2013, 1, 1), datetime(2013, 1, 2)]) assert_frame_equal(result, expected) @@ -14909,8 +15315,8 @@ def test_consolidate_datetime64(self): df.starting = ser_starting.index df.ending = ser_ending.index - tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index) - tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index) + tm.assert_index_equal(DatetimeIndex(df.starting), ser_starting.index) + tm.assert_index_equal(DatetimeIndex(df.ending), ser_ending.index) def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): @@ -15090,7 +15496,7 @@ def test_isin(self): def test_isin_empty(self): df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) result = df.isin([]) - expected = pd.DataFrame(False, df.index, df.columns) + expected = DataFrame(False, df.index, df.columns) assert_frame_equal(result, expected) def test_isin_dict(self): @@ -15166,9 +15572,9 @@ def test_isin_dupe_self(self): assert_frame_equal(result, expected) def test_isin_against_series(self): - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, + df = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, index=['a', 'b', 'c', 'd']) - s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) + s = Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) expected = DataFrame(False, index=df.index, columns=df.columns) expected['A'].loc['a'] = True expected.loc['d'] = True @@ -15272,50 +15678,50 @@ def test_concat_empty_dataframe_dtypes(self): self.assertEqual(result['c'].dtype, np.float64) def test_empty_frame_dtypes_ftypes(self): - empty_df = pd.DataFrame() - assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object)) - assert_series_equal(empty_df.ftypes, 
pd.Series(dtype=np.object)) + empty_df = DataFrame() + assert_series_equal(empty_df.dtypes, Series(dtype=np.object)) + assert_series_equal(empty_df.ftypes, Series(dtype=np.object)) - nocols_df = pd.DataFrame(index=[1,2,3]) - assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object)) - assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object)) + nocols_df = DataFrame(index=[1,2,3]) + assert_series_equal(nocols_df.dtypes, Series(dtype=np.object)) + assert_series_equal(nocols_df.ftypes, Series(dtype=np.object)) - norows_df = pd.DataFrame(columns=list("abc")) - assert_series_equal(norows_df.dtypes, pd.Series(np.object, index=list("abc"))) - assert_series_equal(norows_df.ftypes, pd.Series('object:dense', index=list("abc"))) + norows_df = DataFrame(columns=list("abc")) + assert_series_equal(norows_df.dtypes, Series(np.object, index=list("abc"))) + assert_series_equal(norows_df.ftypes, Series('object:dense', index=list("abc"))) - norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32) - assert_series_equal(norows_int_df.dtypes, pd.Series(np.dtype('int32'), index=list("abc"))) - assert_series_equal(norows_int_df.ftypes, pd.Series('int32:dense', index=list("abc"))) + norows_int_df = DataFrame(columns=list("abc")).astype(np.int32) + assert_series_equal(norows_int_df.dtypes, Series(np.dtype('int32'), index=list("abc"))) + assert_series_equal(norows_int_df.ftypes, Series('int32:dense', index=list("abc"))) odict = OrderedDict - df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]), index=[1, 2, 3]) - assert_series_equal(df.dtypes, pd.Series(odict([('a', np.int64), + df = DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]), index=[1, 2, 3]) + assert_series_equal(df.dtypes, Series(odict([('a', np.int64), ('b', np.bool), ('c', np.float64)]))) - assert_series_equal(df.ftypes, pd.Series(odict([('a', 'int64:dense'), + assert_series_equal(df.ftypes, Series(odict([('a', 'int64:dense'), ('b', 'bool:dense'), ('c', 'float64:dense')]))) # same but for empty 
slice of df - assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int64), + assert_series_equal(df[:0].dtypes, Series(odict([('a', np.int64), ('b', np.bool), ('c', np.float64)]))) - assert_series_equal(df[:0].ftypes, pd.Series(odict([('a', 'int64:dense'), + assert_series_equal(df[:0].ftypes, Series(odict([('a', 'int64:dense'), ('b', 'bool:dense'), ('c', 'float64:dense')]))) def test_dtypes_are_correct_after_column_slice(self): # GH6525 - df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + df = DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) odict = OrderedDict assert_series_equal(df.dtypes, - pd.Series(odict([('a', np.float_), ('b', np.float_), + Series(odict([('a', np.float_), ('b', np.float_), ('c', np.float_),]))) assert_series_equal(df.iloc[:,2:].dtypes, - pd.Series(odict([('c', np.float_)]))) + Series(odict([('c', np.float_)]))) assert_series_equal(df.dtypes, - pd.Series(odict([('a', np.float_), ('b', np.float_), + Series(odict([('a', np.float_), ('b', np.float_), ('c', np.float_),]))) def test_set_index_names(self): @@ -15376,7 +15782,7 @@ def test_select_dtypes_exclude_include(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + 'f': date_range('now', periods=3).values}) exclude = np.datetime64, include = np.bool_, 'integer' r = df.select_dtypes(include=include, exclude=exclude) @@ -15395,7 +15801,7 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + 'f': date_range('now', periods=3).values}) df['g'] = df.f.diff() assert not hasattr(np, 'u8') r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta']) @@ -15427,7 +15833,7 @@ def test_select_dtypes_bad_datetime64(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, 
dtype='float64'), 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + 'f': date_range('now', periods=3).values}) with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): df.select_dtypes(include=['datetime64[D]']) @@ -15441,7 +15847,7 @@ def test_select_dtypes_str_raises(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + 'f': date_range('now', periods=3).values}) string_dtypes = set((str, 'str', np.string_, 'S1', 'unicode', np.unicode_, 'U1')) try: @@ -15463,7 +15869,7 @@ def test_select_dtypes_bad_arg_raises(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], - 'f': pd.date_range('now', periods=3).values}) + 'f': date_range('now', periods=3).values}) with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'): df.select_dtypes(['blargy, blarg, blarg']) @@ -16531,7 +16937,7 @@ def test_query_single_element_booleans(self): def check_query_string_scalar_variable(self, parser, engine): tm.skip_if_no_ne(engine) - df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'], + df = DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'], 'Price': [109.70, 109.72, 183.30, 183.35]}) e = df[df.Symbol == 'BUD US'] symb = 'BUD US' diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f7d93a978a46a..c947481c8a87c 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -4257,8 +4257,11 @@ def test_changing_names(self): self.check_level_names(self.index, new_names) def test_duplicate_names(self): + # GH 9399 self.index.names = ['foo', 'foo'] - assertRaisesRegexp(KeyError, 'Level foo not found', + assertRaisesRegexp(KeyError, 'Level bar not found', + self.index._get_level_number, 'bar') + assertRaisesRegexp(ValueError, 'The name foo occurs multiple times, use a level number', self.index._get_level_number, 'foo') def 
test_get_level_number_integer(self): @@ -4419,7 +4422,6 @@ def test_legacy_pickle(self): assert_almost_equal(exp, exp2) def test_legacy_v2_unpickle(self): - # 0.7.3 -> 0.8.0 format manage path = tm.get_data_path('mindex_073.pickle') obj = pd.read_pickle(path) @@ -4438,7 +4440,6 @@ def test_legacy_v2_unpickle(self): assert_almost_equal(exp, exp2) def test_roundtrip_pickle_with_tz(self): - # GH 8367 # round-trip of timezone index=MultiIndex.from_product([[1,2],['a','b'],date_range('20130101',periods=3,tz='US/Eastern')],names=['one','two','three']) diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 50ae574c03067..e2e17dcb7d115 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -431,6 +431,7 @@ def test_pivot_timegrouper(self): columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' + expected['Carl'] = expected['Carl'].astype(df.dtypes['Quantity']) result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', values='Quantity', aggfunc=np.sum)