From fb1aed61435bbd04b26d12377fdfdc5b027b6e08 Mon Sep 17 00:00:00 2001
From: immerrr
Date: Thu, 13 Mar 2014 00:09:15 +0400
Subject: [PATCH 1/2] CLN: remove items/ref_items attributes from Block class

There is now a single authoritative mapping from BlockManager axis0 to
axis0 of its subordinate blocks:

* _blknos[loc] is the number of the block in which the loc-th item resides
* _blklocs[loc] is the location of the loc-th item within that block

Blocks support this mapping via the blk._ref_locs attribute, which
enumerates all items contained in that block. (A short standalone sketch
of this mapping is appended at the end of this excerpt.)
---
 pandas/computation/tests/test_eval.py |    0
 pandas/core/format.py                 |    8 +-
 pandas/core/frame.py                  |   45 +-
 pandas/core/generic.py                |   49 +-
 pandas/core/groupby.py                |   44 +-
 pandas/core/internals.py              | 2887 ++++++++++++-------------
 pandas/core/reshape.py                |   20 +-
 pandas/io/packers.py                  |   11 +-
 pandas/io/pytables.py                 |   68 +-
 pandas/io/tests/test_pickle.py        |    1 -
 pandas/io/tests/test_pytables.py      |    1 -
 pandas/sparse/series.py               |    2 +-
 pandas/sparse/tests/test_sparse.py    |    2 +-
 pandas/tests/test_frame.py            |   18 +-
 pandas/tests/test_internals.py        |  564 ++---
 pandas/tests/test_multilevel.py       |    2 +-
 pandas/tools/merge.py                 |  560 +----
 pandas/tools/tests/test_merge.py      |   18 +-
 pandas/tseries/resample.py            |    3 +-
 19 files changed, 1926 insertions(+), 2377 deletions(-)
 mode change 100644 => 100755 pandas/computation/tests/test_eval.py

diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
old mode 100644
new mode 100755
diff --git a/pandas/core/format.py b/pandas/core/format.py
index 6d0b0596d08d2..c76693e16494f 100644
--- a/pandas/core/format.py
+++ b/pandas/core/format.py
@@ -1024,9 +1024,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                  float_format=None,

         # preallocate data 2d list
         self.blocks = self.obj._data.blocks
-        ncols = sum(len(b.items) for b in self.blocks)
+        ncols = sum(b.shape[0] for b in self.blocks)
         self.data = [None] * ncols
-        self.column_map = self.obj._data.get_items_map(use_cached=False)

         if chunksize is None:
             chunksize = (100000 / (len(self.cols) or 1)) or 1
@@ -1293,10 +1292,9 @@ def _save_chunk(self, start_i, end_i):
                                           float_format=self.float_format,
                                           date_format=self.date_format)

-            for i, item in enumerate(b.items):
-
+            for col_loc, col in zip(b.ref_locs, d):
                 # self.data is a preallocated list
-                self.data[self.column_map[b][i]] = d[i]
+                self.data[col_loc] = col

         ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                         float_format=self.float_format,
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 23736dafe3556..c32ca065d785d 100755
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1043,9 +1043,11 @@ def to_panel(self):

         new_blocks = []
         for block in selfsorted._data.blocks:
-            newb = block2d_to_blocknd(block.values.T, block.items, shape,
-                                      [major_labels, minor_labels],
-                                      ref_items=selfsorted.columns)
+            newb = block2d_to_blocknd(
+                values=block.values.T,
+                placement=block.ref_locs, shape=shape,
+                labels=[major_labels, minor_labels],
+                ref_items=selfsorted.columns)
             new_blocks.append(newb)

         # preserve names, if any
@@ -1934,7 +1936,9 @@ def _ensure_valid_index(self, value):
                 raise ValueError('Cannot set a frame with no defined index '
                                  'and a value that cannot be converted to a '
                                  'Series')
-            self._data.set_axis(1, value.index.copy(), check_axis=False)
+
+            self._data = self._data.reindex_axis(value.index.copy(), axis=1,
+                                                 fill_value=np.nan)

         # we are a scalar
         # noop
@@ -2039,7 +2043,11 @@ def _sanitize_column(self, key, value):

     @property
     def _series(self):
-        return self._data.get_series_dict()
+        result = {}
+        for idx, item
in enumerate(self.columns): + result[item] = Series(self._data.iget(idx), index=self.index, + name=item) + return result def lookup(self, row_labels, col_labels): """Label-based "fancy indexing" function for DataFrame. @@ -2629,16 +2637,14 @@ def trans(v): indexer = _nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) + if inplace: - if axis == 1: - new_data = self._data.reindex_items( - self._data.items[indexer], - copy=False) - elif axis == 0: - new_data = self._data.take(indexer) - self._update_inplace(new_data) + return self._update_inplace(new_data) else: - return self.take(indexer, axis=axis, convert=False, is_copy=False) + return self._constructor(new_data).__finalize__(self) def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): """ @@ -2673,16 +2679,13 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): else: return self.take(indexer, axis=axis, convert=False) + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) if inplace: - if axis == 1: - new_data = self._data.reindex_items( - self._data.items[indexer], - copy=False) - elif axis == 0: - new_data = self._data.take(indexer) - self._update_inplace(new_data) + return self._update_inplace(new_data) else: - return self.take(indexer, axis=axis, convert=False, is_copy=False) + return self._constructor(new_data).__finalize__(self) def swaplevel(self, i, j, axis=0): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d894289c87eee..7c5c77a29f465 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -565,7 +565,7 @@ def f(x): f = _get_rename_function(v) baxis = self._get_block_manager_axis(axis) - result._data = result._data.rename(f, axis=baxis, copy=copy) + result._data = result._data.rename_axis(f, axis=baxis, copy=copy) result._clear_item_cache() if inplace: @@ -1217,21 +1217,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True): taken : type of caller """ - # check/convert indicies here - if convert: - axis = self._get_axis_number(axis) - indices = _maybe_convert_indices( - indices, len(self._get_axis(axis))) - - baxis = self._get_block_manager_axis(axis) - if baxis == 0: - labels = self._get_axis(axis) - new_items = labels.take(indices) - new_data = self._data.reindex_axis(new_items, indexer=indices, - axis=baxis) - else: - new_data = self._data.take(indices, axis=baxis) - + new_data = self._data.take(indices, + axis=self._get_block_manager_axis(axis), + convert=True, verify=True) result = self._constructor(new_data).__finalize__(self) # maybe set copy if we didn't actually change the index @@ -1716,30 +1704,15 @@ def _reindex_with_indexers(self, reindexers, method=None, if index is None: continue - index = _ensure_index(index) - # reindex the axis - if method is not None: - new_data = new_data.reindex_axis( - index, indexer=indexer, method=method, axis=baxis, - fill_value=fill_value, limit=limit, copy=copy) - - elif indexer is not None: - # TODO: speed up on homogeneous DataFrame objects + index = _ensure_index(index) + if indexer is not None: indexer = com._ensure_int64(indexer) - new_data = new_data.reindex_indexer(index, indexer, axis=baxis, - fill_value=fill_value, - allow_dups=allow_dups) - - elif (baxis == 0 and index is not None and - index is not new_data.axes[baxis]): - new_data = new_data.reindex_items(index, copy=copy, 
- fill_value=fill_value) - - elif (baxis > 0 and index is not None and - index is not new_data.axes[baxis]): - new_data = new_data.copy(deep=copy) - new_data.set_axis(baxis, index) + + # TODO: speed up on homogeneous DataFrame objects + new_data = new_data.reindex_indexer(index, indexer, axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups) if copy and new_data is self._data: new_data = new_data.copy() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c0222ad248e0c..b284e3c63209d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2196,10 +2196,10 @@ def _iterate_slices(self): yield val, slicer(val) def _cython_agg_general(self, how, numeric_only=True): - new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) - return self._wrap_agged_blocks(new_blocks) + new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) + return self._wrap_agged_blocks(new_items, new_blocks) - def _wrap_agged_blocks(self, blocks): + def _wrap_agged_blocks(self, items, blocks): obj = self._obj_with_exclusions new_axes = list(obj._data.axes) @@ -2210,6 +2210,10 @@ def _wrap_agged_blocks(self, blocks): else: new_axes[self.axis] = self.grouper.result_index + # Make sure block manager integrity check passes. + assert new_axes[0].equals(items) + new_axes[0] = items + mgr = BlockManager(blocks, new_axes) new_obj = type(obj)(mgr) @@ -2223,14 +2227,14 @@ def _cython_agg_blocks(self, how, numeric_only=True): new_blocks = [] + if numeric_only: + data = data.get_numeric_data(copy=False) + for block in data.blocks: values = block.values is_numeric = is_numeric_dtype(values.dtype) - if numeric_only and not is_numeric: - continue - if is_numeric: values = com.ensure_float(values) @@ -2239,13 +2243,13 @@ def _cython_agg_blocks(self, how, numeric_only=True): # see if we can cast the block back to the original dtype result = block._try_cast_result(result) - newb = make_block(result, block.items, block.ref_items) + newb = make_block(result, placement=block.ref_locs) new_blocks.append(newb) if len(new_blocks) == 0: raise DataError('No numeric types to aggregate') - return new_blocks + return data.items, new_blocks def _get_data_to_aggregate(self): obj = self._obj_with_exclusions @@ -2837,28 +2841,10 @@ def _wrap_aggregated_output(self, output, names=None): return result.convert_objects() - def _wrap_agged_blocks(self, blocks): - obj = self._obj_with_exclusions - - if self.axis == 0: - agg_labels = obj.columns - else: - agg_labels = obj.index - - if sum(len(x.items) for x in blocks) == len(agg_labels): - output_keys = agg_labels - else: - all_items = [] - for b in blocks: - all_items.extend(b.items) - output_keys = agg_labels[agg_labels.isin(all_items)] - - for blk in blocks: - blk.set_ref_items(output_keys, maybe_rename=False) - + def _wrap_agged_blocks(self, items, blocks): if not self.as_index: index = np.arange(blocks[0].values.shape[1]) - mgr = BlockManager(blocks, [output_keys, index]) + mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) group_levels = self.grouper.get_group_levels() @@ -2869,7 +2855,7 @@ def _wrap_agged_blocks(self, blocks): result = result.consolidate() else: index = self.grouper.result_index - mgr = BlockManager(blocks, [output_keys, index]) + mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) if self.axis == 1: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 792a310c8a554..9c5564941cd08 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,30 +1,35 @@ 
+import copy import itertools import re import operator from datetime import datetime, timedelta -import copy from collections import defaultdict import numpy as np from pandas.core.base import PandasObject +from pandas.hashtable import Factorizer from pandas.core.common import (_possibly_downcast_to_dtype, isnull, notnull, _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like, ABCSparseSeries, _infer_dtype_from_scalar, - _values_from_object, _is_null_datelike_scalar) -from pandas.core.index import Index, MultiIndex, _ensure_index + _is_null_datelike_scalar, + is_timedelta64_dtype, is_datetime64_dtype,) +from pandas.core.index import Index, MultiIndex, _ensure_index, _all_indexes_same from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib import pandas.computation.expressions as expressions +from pandas.util.decorators import cache_readonly from pandas.tslib import Timestamp from pandas import compat -from pandas.compat import range, lrange, lmap, callable, map, zip, u +from pandas.compat import (range, lrange, lmap, callable, map, zip, u, + OrderedDict) from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + class Block(PandasObject): """ @@ -33,7 +38,7 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ - __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim'] + __slots__ = ['_ref_locs', 'values', 'ndim'] is_numeric = False is_float = False is_integer = False @@ -49,8 +54,7 @@ class Block(PandasObject): _verify_integrity = True _ftype = 'dense' - def __init__(self, values, items, ref_items, ndim=None, fastpath=False, - placement=None): + def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: ndim = values.ndim @@ -58,21 +62,14 @@ def __init__(self, values, items, ref_items, ndim=None, fastpath=False, if values.ndim != ndim: raise ValueError('Wrong number of dimensions') - if len(items) != len(values): - raise ValueError('Wrong number of items passed %d, index implies ' - '%d' % (len(values), len(items))) + if len(placement) != len(values): + raise ValueError('Wrong number of items passed %d, placement implies ' + '%d' % (len(values), len(placement))) - self.set_ref_locs(placement) + self._ref_locs = np.array(placement, dtype=np.int_, copy=True) self.values = values self.ndim = ndim - if fastpath: - self.items = items - self.ref_items = ref_items - else: - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) - @property def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) @@ -92,79 +89,8 @@ def fill_value(self): @property def ref_locs(self): - if self._ref_locs is None: - # we have a single block, maybe have duplicates - # but indexer is easy - # also if we are not really reindexing, just numbering - if self._is_single_block or self.ref_items.equals(self.items): - indexer = np.arange(len(self.items)) - else: - - indexer = self.ref_items.get_indexer(self.items) - indexer = com._ensure_platform_int(indexer) - if (indexer == -1).any(): - - # this means that we have nan's in our block - try: - indexer[indexer == -1] = np.arange( - len(self.items))[isnull(self.items)] - except: - raise AssertionError('Some block items were not in ' - 'block ref_items') - - self._ref_locs = indexer return self._ref_locs - def take_ref_locs(self, indexer): - """ - need to preserve the ref_locs and just shift them - 
return None if ref_locs is None - - see GH6509 - """ - - ref_locs = self._ref_locs - if ref_locs is None: - return None - - tindexer = np.ones(len(ref_locs),dtype=bool) - tindexer[indexer] = False - tindexer = tindexer.astype(int).cumsum()[indexer] - ref_locs = ref_locs[indexer] - - # Make sure the result is a copy, or otherwise self._ref_locs will be - # updated. - if ref_locs.base is not None: - ref_locs = ref_locs.copy() - - ref_locs -= tindexer - return ref_locs - - def reset_ref_locs(self): - """ reset the block ref_locs """ - self._ref_locs = np.empty(len(self.items), dtype='int64') - - def set_ref_locs(self, placement): - """ explicity set the ref_locs indexer, only necessary for duplicate - indicies - """ - if placement is None: - self._ref_locs = None - else: - self._ref_locs = np.array(placement, dtype='int64', copy=True) - - def set_ref_items(self, ref_items, maybe_rename=True): - """ - If maybe_rename=True, need to set the items for this guy - """ - if not isinstance(ref_items, Index): - raise AssertionError('block ref_items must be an Index') - if maybe_rename == 'clear': - self._ref_locs = None - elif maybe_rename: - self.items = ref_items.take(self.ref_locs) - self.ref_items = ref_items - def __unicode__(self): # don't want to print out all of the items here @@ -178,32 +104,38 @@ def __unicode__(self): shape = ' x '.join([com.pprint_thing(s) for s in self.shape]) result = '%s: %s, %s, dtype: %s' % ( - name, com.pprint_thing(self.items), shape, self.dtype) + name, com.pprint_thing(self.ref_locs), shape, self.dtype) return result - def __contains__(self, item): - return item in self.items - def __len__(self): return len(self.values) def __getstate__(self): - # should not pickle generally (want to share ref_items), but here for - # completeness - return (self.items, self.ref_items, self.values) + return self.ref_locs, self.values def __setstate__(self, state): - items, ref_items, values = state - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) - self.values = values - self.ndim = values.ndim + self._ref_locs, self.values = state + self.ndim = self.values.ndim def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] + def _getitem_block(self, slicer): + """ + Perform __getitem__-like, return result as block. 
+ """ + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + + return self.__class__(values=self.values[slicer], + ndim=self.ndim, + fastpath=True, + placement=self.ref_locs[axis0_slicer]) + @property def shape(self): return self.values.shape @@ -223,19 +155,11 @@ def ftype(self): def as_block(self, result): """ if we are not a block, then wrap as a block, must have compatible shape """ if not isinstance(result, Block): - result = make_block(result, - self.items, - self.ref_items) + result = make_block(values=result, placement=self.ref_locs,) return result def merge(self, other): - if not self.ref_items.equals(other.ref_items): - raise AssertionError('Merge operands must have same ref_items') - - # Not sure whether to allow this or not - # if not union_ref.equals(other.ref_items): - # union_ref = self.ref_items + other.ref_items - return _merge_blocks([self, other], self.ref_items) + return _merge_blocks([self, other]) def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): @@ -249,11 +173,11 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, new_values = com.take_nd(self.values, indexer, axis, fill_value=fill_value, mask_info=mask_info) - return make_block(new_values, self.items, self.ref_items, + return make_block(new_values, ndim=self.ndim, fastpath=True, - placement=self._ref_locs) + placement=self.ref_locs) - def reindex_items_from(self, new_ref_items, indexer=None, method=None, + def reindex_items_from(self, indexer, method=None, fill_value=None, limit=None, copy=True): """ Reindex to only those items contained in the input set of items @@ -265,45 +189,16 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, ------- reindexed : Block """ - if indexer is None: - new_ref_items, indexer = self.items.reindex(new_ref_items, - limit=limit) - - needs_fill = method is not None if fill_value is None: fill_value = self.fill_value - new_items = new_ref_items - if indexer is None: - new_values = self.values.copy() if copy else self.values - - else: - - # single block reindex, filling is already happending - if self.ndim == 1: - new_values = com.take_1d(self.values, indexer, - fill_value=fill_value) - block = make_block(new_values, new_items, new_ref_items, - ndim=self.ndim, fastpath=True) - return block - else: - - masked_idx = indexer[indexer != -1] - new_items = self.items.take(masked_idx) - new_values = com.take_nd(self.values, masked_idx, axis=0, - allow_fill=False) - # fill if needed - if needs_fill: - new_values = com.interpolate_2d(new_values, method=method, - limit=limit, fill_value=fill_value) - - block = make_block(new_values, new_items, new_ref_items, - ndim=self.ndim, fastpath=True) - - # down cast if needed - if not self.is_float and (needs_fill or notnull(fill_value)): - block = block.downcast() - + # single block only + assert self.ndim == 1 + new_values = com.take_1d(self.values, indexer, + fill_value=fill_value) + block = make_block(new_values, + ndim=self.ndim, fastpath=True, + placement=np.arange(len(new_values))) return block def get(self, item): @@ -313,7 +208,7 @@ def get(self, item): def iget(self, i): return self.values[i] - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -321,20 +216,18 @@ def set(self, item, value, check=False): ------- None """ - loc = self.items.get_loc(item) - self.values[loc] = value + self.values[locs] = values - def delete(self, item): + def 
delete(self, loc): """ Returns ------- y : Block (new object) """ - loc = self.items.get_loc(item) - new_items = self.items.delete(loc) new_values = np.delete(self.values, loc, 0) - return make_block(new_values, new_items, self.ref_items, - ndim=self.ndim, klass=self.__class__, fastpath=True) + return make_block(new_values, + ndim=self.ndim, klass=self.__class__, fastpath=True, + placement=np.delete(self.ref_locs, loc)) def split_block_at(self, item): """ @@ -355,9 +248,8 @@ def split_block_at(self, item): mask = -loc for s, e in com.split_ranges(mask): + # FIXME: drop this function yield make_block(self.values[s:e], - self.items[s:e].copy(), - self.ref_items, ndim=self.ndim, klass=self.__class__, fastpath=True) @@ -415,8 +307,8 @@ def downcast(self, dtypes=None): dtypes = 'infer' nv = _possibly_downcast_to_dtype(values, dtypes) - return [make_block(nv, self.items, self.ref_items, ndim=self.ndim, - fastpath=True)] + return [make_block(nv, ndim=self.ndim, + fastpath=True, placement=self.ref_locs)] # ndim > 1 if dtypes is None: @@ -429,11 +321,12 @@ def downcast(self, dtypes=None): # item-by-item # this is expensive as it splits the blocks items-by-item blocks = [] - for i, item in enumerate(self.items): + for i, rl in enumerate(self.ref_locs): if dtypes == 'infer': dtype = 'infer' else: + raise AssertionError("dtypes as dict is not supported yet") dtype = dtypes.get(item, self._downcast_dtype) if dtype is None: @@ -442,8 +335,9 @@ def downcast(self, dtypes=None): nv = _possibly_downcast_to_dtype(values[i], dtype) nv = _block_shape(nv, ndim=self.ndim) - blocks.append(make_block(nv, Index([item]), self.ref_items, - ndim=self.ndim, fastpath=True)) + blocks.append(make_block(nv, + ndim=self.ndim, fastpath=True, + placement=[rl])) return blocks @@ -466,9 +360,11 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, try: # force the copy here if values is None: - values = com._astype_nansafe(self.values, dtype, copy=True) - newb = make_block(values, self.items, self.ref_items, - ndim=self.ndim, placement=self._ref_locs, + # _astype_nansafe works fine with 1-d only + values = com._astype_nansafe(self.values.ravel(), dtype, copy=True) + values = values.reshape(self.values.shape) + newb = make_block(values, + ndim=self.ndim, placement=self.ref_locs, fastpath=True, dtype=dtype, klass=klass) except: if raise_on_error is True: @@ -482,7 +378,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, "(%s [%s])" % (copy, self.dtype.name, self.itemsize, newb.dtype.name, newb.itemsize)) - return [newb] + return newb def convert(self, copy=True, **kwargs): """ attempt to coerce any object types to better types @@ -497,24 +393,31 @@ def prepare_for_merge(self, **kwargs): def post_merge(self, items, **kwargs): """ we are non-sparse block, try to convert to a sparse block(s) """ - overlap = set(items.keys()) & set(self.items) - if len(overlap): - overlap = _ensure_index(overlap) + sparsified_mask = self.items.isin(items.keys()) - new_blocks = [] - for item in overlap: - dtypes = set(items[item]) + if not sparsified_mask.any(): + return self - # this is a safe bet with multiple dtypes - dtype = list(dtypes)[0] if len(dtypes) == 1 else np.float64 + new_blocks = [] + for i in sparsified_mask.nonzero()[0]: + item = self.items[i] + ref_loc = self.ref_locs[i] - b = make_block(SparseArray(self.get(item), dtype=dtype), - [item], self.ref_items) - new_blocks.append(b) + dtypes = set(items[item]) + # this is a safe bet with multiple dtypes + dtype = list(dtypes)[0] if len(dtypes) == 
1 else np.float64 - return new_blocks + new_blocks.append(make_block( + values=SparseArray(self.iget(i), dtype=dtype), + placement=[ref_loc])) - return self + nonsparsified_locs = (~sparsified_mask).nonzero()[0] + if len(nonsparsified_locs): + new_blocks.append(make_block( + values=self.values[nonsparsified_locs], + placement=self.ref_locs[nonsparsified_locs])) + + return new_blocks def _can_hold_element(self, value): raise NotImplementedError() @@ -581,15 +484,13 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): return values.tolist() # block actions #### - def copy(self, deep=True, ref_items=None): + def copy(self, deep=True): values = self.values if deep: values = values.copy() - if ref_items is None: - ref_items = self.ref_items - return make_block(values, self.items, ref_items, ndim=self.ndim, + return make_block(values, ndim=self.ndim, klass=self.__class__, fastpath=True, - placement=self._ref_locs) + placement=self.ref_locs) def replace(self, to_replace, value, inplace=False, filter=None, regex=False): @@ -599,9 +500,8 @@ def replace(self, to_replace, value, inplace=False, filter=None, compatibility.""" mask = com.mask_missing(self.values, to_replace) if filter is not None: - for i, item in enumerate(self.items): - if item not in filter: - mask[i] = False + filtered_out = ~Index(self.ref_locs, copy=False).isin(filter) + mask[filtered_out.nonzero()[0]] = False if not mask.any(): if inplace: @@ -672,7 +572,7 @@ def setitem(self, indexer, value): dtype = 'infer' values = self._try_coerce_result(values) values = self._try_cast_result(values, dtype) - return [make_block(transf(values), self.items, self.ref_items, + return [make_block(transf(values), ndim=self.ndim, placement=self._ref_locs, fastpath=True)] except (ValueError, TypeError) as detail: @@ -704,21 +604,11 @@ def putmask(self, mask, new, align=True, inplace=False): # may need to align the new if hasattr(new, 'reindex_axis'): - if align: - axis = getattr(new, '_info_axis_number', 0) - new = new.reindex_axis(self.items, axis=axis, - copy=False).values.T - else: - new = new.values.T + new = new.values.T # may need to align the mask if hasattr(mask, 'reindex_axis'): - if align: - axis = getattr(mask, '_info_axis_number', 0) - mask = mask.reindex_axis( - self.items, axis=axis, copy=False).values.T - else: - mask = mask.values.T + mask = mask.values.T # if we are passed a scalar None, convert it here if not is_list_like(new) and isnull(new): @@ -738,45 +628,8 @@ def putmask(self, mask, new, align=True, inplace=False): # need to go column by column new_blocks = [] - - def create_block(v, m, n, item, reshape=True): - """ return a new block, try to preserve dtype if possible """ - - # n should be the length of the mask or a scalar here - if not is_list_like(n): - n = np.array([n] * len(m)) - - # see if we are only masking values that if putted - # will work in the current dtype - nv = None - try: - nn = n[m] - nn_at = nn.astype(self.dtype) - if (nn == nn_at).all(): - nv = v.copy() - nv[mask] = nn_at - except (ValueError, IndexError, TypeError): - pass - - # change the dtype - if nv is None: - dtype, _ = com._maybe_promote(n.dtype) - nv = v.astype(dtype) - try: - nv[m] = n - except ValueError: - idx, = np.where(np.squeeze(m)) - for mask_index, new_val in zip(idx, n): - nv[mask_index] = new_val - - if reshape: - nv = _block_shape(nv) - return make_block(nv, [item], self.ref_items) - else: - return make_block(nv, item, self.ref_items) - if self.ndim > 1: - for i, item in enumerate(self.items): + for i, ref_loc in 
enumerate(self.ref_locs): m = mask[i] v = new_values[i] @@ -792,27 +645,31 @@ def create_block(v, m, n, item, reshape=True): # we need to exiplicty astype here to make a copy n = n.astype(dtype) - block = create_block(v, m, n, item) - + nv = _putmask_smart(v, m, n) else: nv = v if inplace else v.copy() - nv = _block_shape(nv) - block = make_block( - nv, Index([item]), self.ref_items, fastpath=True) + + # Put back the dimension that was taken from it and make + # a block out of the result. + block = make_block(values=nv[np.newaxis], + placement=[ref_loc], + fastpath=True) new_blocks.append(block) else: - new_blocks.append(create_block(new_values, mask, new, - self.items, reshape=False)) + nv = _putmask_smart(new_values, mask, new) + new_blocks.append(make_block(values=nv, + placement=self.ref_locs, + fastpath=True)) return new_blocks if inplace: return [self] - return [make_block(new_values, self.items, self.ref_items, - placement=self._ref_locs, fastpath=True)] + return [make_block(new_values, + placement=self.ref_locs, fastpath=True)] def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, @@ -891,9 +748,9 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, dtype=self.dtype) values = self._try_coerce_result(values) - blocks = [make_block(values, self.items, self.ref_items, + blocks = [make_block(values, ndim=self.ndim, klass=self.__class__, - fastpath=True)] + fastpath=True, placement=self.ref_locs)] return self._maybe_downcast(blocks, downcast) def _interpolate(self, method=None, index=None, values=None, @@ -930,11 +787,12 @@ def func(x): # interp each column independently interp_values = np.apply_along_axis(func, axis, data) - blocks = [make_block(interp_values, self.items, self.ref_items, - ndim=self.ndim, klass=self.__class__, fastpath=True)] + blocks = [make_block(interp_values, + ndim=self.ndim, klass=self.__class__, + fastpath=True, placement=self.ref_locs)] return self._maybe_downcast(blocks, downcast) - def take(self, indexer, ref_items, new_axis, axis=1): + def take(self, indexer, new_axis, axis=1): if axis < 1: raise AssertionError('axis must be at least 1, got %d' % axis) new_values = com.take_nd(self.values, indexer, axis=axis, @@ -946,20 +804,18 @@ def take(self, indexer, ref_items, new_axis, axis=1): if not new_axis.is_unique: ref_locs = self._ref_locs - return [make_block(new_values, self.items, ref_items, ndim=self.ndim, + return [make_block(new_values, ndim=self.ndim, klass=self.__class__, placement=ref_locs, fastpath=True)] def get_values(self, dtype=None): return self.values - def get_merge_length(self): - return len(self.values) - def diff(self, n): """ return block for the diff of the values """ new_values = com.diff(self.values, n, axis=1) - return [make_block(new_values, self.items, self.ref_items, - ndim=self.ndim, fastpath=True)] + return [make_block(values=new_values, + ndim=self.ndim, fastpath=True, + placement=self.ref_locs)] def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ @@ -983,8 +839,9 @@ def shift(self, periods, axis=0): if f_ordered: new_values = new_values.T - return [make_block(new_values, self.items, self.ref_items, - ndim=self.ndim, fastpath=True)] + return [make_block(new_values, + ndim=self.ndim, fastpath=True, + placement=self.ref_locs)] def eval(self, func, other, raise_on_error=True, try_cast=False): """ @@ -1003,11 +860,8 @@ def eval(self, func, other, raise_on_error=True, try_cast=False): """ values = self.values - # see if we can align other if 
hasattr(other, 'reindex_axis'): - axis = getattr(other, '_info_axis_number', 0) - other = other.reindex_axis( - self.items, axis=axis, copy=False).values + other = other.values # make sure that we can broadcast is_transposed = False @@ -1078,8 +932,8 @@ def handle_error(): if try_cast: result = self._try_cast_result(result) - return [make_block(result, self.items, self.ref_items, ndim=self.ndim, - fastpath=True)] + return [make_block(result, ndim=self.ndim, + fastpath=True, placement=self.ref_locs)] def where(self, other, cond, align=True, raise_on_error=True, try_cast=False): @@ -1103,12 +957,7 @@ def where(self, other, cond, align=True, raise_on_error=True, # see if we can align other if hasattr(other, 'reindex_axis'): - if align: - axis = getattr(other, '_info_axis_number', 0) - other = other.reindex_axis(self.items, axis=axis, - copy=True).values - else: - other = other.values + other = other.values # make sure that we can broadcast is_transposed = False @@ -1129,10 +978,7 @@ def where(self, other, cond, align=True, raise_on_error=True, raise ValueError( "where must have a condition that is ndarray like") - if align and hasattr(cond, 'reindex_axis'): - axis = getattr(cond, '_info_axis_number', 0) - cond = cond.reindex_axis(self.items, axis=axis, copy=True).values - else: + if hasattr(cond, 'reindex_axis'): cond = cond.values # may need to undo transpose of values @@ -1177,8 +1023,8 @@ def func(c, v, o): if try_cast: result = self._try_cast_result(result) - return make_block(result, self.items, self.ref_items, - ndim=self.ndim) + return make_block(result, + ndim=self.ndim, placement=self.ref_locs) # might need to separate out blocks axis = cond.ndim - 1 @@ -1189,11 +1035,10 @@ def func(c, v, o): result_blocks = [] for m in [mask, ~mask]: if m.any(): - items = self.items[m] - slices = [slice(None)] * cond.ndim - slices[axis] = self.items.get_indexer(items) - r = self._try_cast_result(result[slices]) - result_blocks.append(make_block(r.T, items, self.ref_items)) + r = self._try_cast_result( + result.take(m.nonzero()[0], axis=axis)) + result_blocks.append(make_block(r.T, + placement=self.ref_locs[m])) return result_blocks @@ -1410,12 +1255,12 @@ class ObjectBlock(Block): is_object = True _can_hold_na = True - def __init__(self, values, items, ref_items, ndim=2, fastpath=False, + def __init__(self, values, ndim=2, fastpath=False, placement=None): if issubclass(values.dtype.type, compat.string_types): values = np.array(values, dtype=object) - super(ObjectBlock, self).__init__(values, items, ref_items, ndim=ndim, + super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath, placement=placement) @@ -1436,11 +1281,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T """ # attempt to create new type blocks - is_unique = self.items.is_unique blocks = [] if by_item and not self._is_single_block: - for i, c in enumerate(self.items): + for i, rl in enumerate(self.ref_locs): values = self.iget(i) values = com._possibly_convert_objects( @@ -1449,10 +1293,8 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T convert_timedeltas=convert_timedeltas, ).reshape(values.shape) values = _block_shape(values, ndim=self.ndim) - items = self.items.take([i]) - placement = None if is_unique else [i] - newb = make_block(values, items, self.ref_items, - ndim=self.ndim, placement=placement) + newb = make_block(values, + ndim=self.ndim, placement=[rl]) blocks.append(newb) else: @@ -1461,12 +1303,12 @@ def convert(self, convert_dates=True, 
convert_numeric=True, convert_timedeltas=T self.values.ravel(), convert_dates=convert_dates, convert_numeric=convert_numeric ).reshape(self.values.shape) - blocks.append(make_block(values, self.items, self.ref_items, - ndim=self.ndim)) + blocks.append(make_block(values, + ndim=self.ndim, placement=self.ref_locs)) return blocks - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -1475,26 +1317,24 @@ def set(self, item, value, check=False): None """ - loc = self.items.get_loc(item) - # GH6026 if check: try: - if (self.values[loc] == value).all(): + if (self.values[locs] == values).all(): return except: pass try: - self.values[loc] = value + self.values[locs] = values except (ValueError): # broadcasting error # see GH6171 - new_shape = list(value.shape) + new_shape = list(values.shape) new_shape[0] = len(self.items) self.values = np.empty(tuple(new_shape),dtype=self.dtype) self.values.fill(np.nan) - self.values[loc] = value + self.values[locs] = values def _maybe_downcast(self, blocks, downcast=None): @@ -1613,27 +1453,29 @@ def re_replacer(s): f = np.vectorize(re_replacer, otypes=[self.dtype]) - try: - filt = lmap(self.items.get_loc, filter) - except TypeError: + if filter is None: filt = slice(None) + else: + filt = (Index(self.ref_locs, copy=False) + .isin(filter).nonzero()[0]) new_values[filt] = f(new_values[filt]) - return [self if inplace else make_block(new_values, self.items, - self.ref_items, fastpath=True)] + return [self if inplace else + make_block(new_values, + fastpath=True, placement=self.ref_locs)] class DatetimeBlock(Block): is_datetime = True _can_hold_na = True - def __init__(self, values, items, ref_items, fastpath=False, - placement=None, **kwargs): + def __init__(self, values, placement, + fastpath=False, **kwargs): if values.dtype != _NS_DTYPE: values = tslib.cast_to_nanoseconds(values) - super(DatetimeBlock, self).__init__(values, items, ref_items, + super(DatetimeBlock, self).__init__(values, fastpath=True, placement=placement, **kwargs) @@ -1705,7 +1547,8 @@ def fillna(self, value, limit=None, np.putmask(values, mask, value) return [self if inplace else - make_block(values, self.items, self.ref_items, fastpath=True)] + make_block(values, + fastpath=True, placement=self.ref_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs): @@ -1745,7 +1588,7 @@ def astype(self, dtype, copy=False, raise_on_error=True): return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, klass=klass) - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -1753,12 +1596,11 @@ def set(self, item, value, check=False): ------- None """ - loc = self.items.get_loc(item) - - if value.dtype != _NS_DTYPE: - value = tslib.cast_to_nanoseconds(value) + if values.dtype != _NS_DTYPE: + # Workaround for numpy 1.6 bug + values = tslib.cast_to_nanoseconds(values) - self.values[loc] = value + self.values[locs] = values def get_values(self, dtype=None): # return object dtype as Timestamps @@ -1771,7 +1613,7 @@ def get_values(self, dtype=None): class SparseBlock(Block): """ implement as a list of sparse arrays of the same dtype """ - __slots__ = ['items', 'ref_items', '_ref_locs', 'ndim', 'values'] + __slots__ = ['_ref_locs', 'ndim', 'values'] is_sparse = True is_numeric = True _can_hold_na = True @@ -1779,8 +1621,8 @@ class SparseBlock(Block): _verify_integrity = False _ftype = 'sparse' - def 
__init__(self, values, items, ref_items, ndim=None, fastpath=False, - placement=None): + def __init__(self, values, placement, + ndim=None, fastpath=False,): # kludgetastic if ndim is not None: @@ -1789,24 +1631,19 @@ def __init__(self, values, items, ref_items, ndim=None, fastpath=False, elif ndim > 2: ndim = ndim else: - if len(items) != 1: + if len(placement) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self._ref_locs = None + self._ref_locs = np.array(placement, dtype=np.int_, copy=True) + self.values = values - if fastpath: - self.items = items - self.ref_items = ref_items - else: - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) @property def shape(self): - return (len(self.items), self.sp_index.length) + return (len(self.ref_locs), self.sp_index.length) @property def itemsize(self): @@ -1834,6 +1671,11 @@ def sp_values(self, v): kind=self.kind, dtype=v.dtype, fill_value=self.fill_value, copy=False) + def iget(self, col): + if col != 0: + raise IndexError("SparseBlock only contains one item") + return self.values + @property def sp_index(self): return self.values.sp_index @@ -1851,15 +1693,9 @@ def __len__(self): def should_store(self, value): return isinstance(value, SparseArray) - def prepare_for_merge(self, **kwargs): - """ create a dense block """ - return make_block(self.get_values(), self.items, self.ref_items) - - def post_merge(self, items, **kwargs): - return self - - def set(self, item, value, check=False): - self.values = value + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values def get(self, item): if self.ndim == 1: @@ -1879,33 +1715,33 @@ def get_values(self, dtype=None): values = values.reshape((1,) + values.shape) return values - def get_merge_length(self): - return 1 - - def make_block(self, values, items=None, ref_items=None, sparse_index=None, - kind=None, dtype=None, fill_value=None, copy=False, - fastpath=True): + def copy(self, deep=True): + return self.make_block(values=self.values, + sparse_index=self.sp_index, + kind=self.kind, copy=deep, + placement=self.ref_locs) + + def make_block(self, values, placement, + sparse_index=None, kind=None, dtype=None, fill_value=None, + copy=False, fastpath=True): """ return a new block """ if dtype is None: dtype = self.dtype if fill_value is None: fill_value = self.fill_value - if items is None: - items = self.items - if ref_items is None: - ref_items = self.ref_items new_values = SparseArray(values, sparse_index=sparse_index, kind=kind or self.kind, dtype=dtype, fill_value=fill_value, copy=copy) - return make_block(new_values, items, ref_items, ndim=self.ndim, - fastpath=fastpath) + return make_block(new_values, ndim=self.ndim, + fastpath=fastpath, placement=placement) def interpolate(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, **kwargs): values = com.interpolate_2d( self.values.to_dense(), method, axis, limit, fill_value) - return self.make_block(values, self.items, self.ref_items) + return self.make_block(values=values, + placement=self.ref_locs) def fillna(self, value, limit=None, inplace=False, downcast=None): # we may need to upcast our fill to match our dtype @@ -1914,7 +1750,8 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): if issubclass(self.dtype.type, np.floating): value = float(value) values = self.values if inplace else self.values.copy() - return [self.make_block(values.get_values(value), fill_value=value)] + return [self.make_block(values=values.get_values(value), + 
fill_value=value, placement=self.ref_locs)] def shift(self, periods, axis=0): @@ -1933,9 +1770,9 @@ def shift(self, periods, axis=0): new_values[:periods] = fill_value else: new_values[periods:] = fill_value - return [self.make_block(new_values)] + return [self.make_block(new_values, placement=self.ref_locs)] - def take(self, indexer, ref_items, new_axis, axis=1): + def take(self, indexer, new_axis, axis=1): """ going to take our items along the long dimension""" if axis < 1: @@ -1954,10 +1791,11 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, # taking on the 0th axis always here if fill_value is None: fill_value = self.fill_value - return self.make_block(self.values.take(indexer), items=self.items, - fill_value=fill_value) + return self.make_block(self.values.take(indexer), + fill_value=fill_value, + placement=self.ref_locs) - def reindex_items_from(self, new_ref_items, indexer=None, method=None, + def reindex_items_from(self, indexer, method=None, fill_value=None, limit=None, copy=True): """ Reindex to only those items contained in the input set of items @@ -1972,25 +1810,11 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, # 1-d always if indexer is None: - new_ref_items, indexer = self.items.reindex(new_ref_items, - limit=limit) - if indexer is None: - indexer = np.arange(len(self.items)) - - # single block - if self.ndim == 1: + indexer = np.arange(len(self.ref_locs)) - new_items = new_ref_items - new_values = com.take_1d(self.values.values, indexer) - - else: - - # if we don't overlap at all, then don't include this block - new_items = self.items & new_ref_items - if not len(new_items): - return None - - new_values = self.values.values + # single block only + assert self.ndim == 1 + new_values = com.take_1d(self.values.values, indexer) # fill if needed if method is not None or limit is not None: @@ -1999,8 +1823,9 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, new_values = com.interpolate_2d(new_values, method=method, limit=limit, fill_value=fill_value) - return self.make_block(new_values, items=new_items, - ref_items=new_ref_items, copy=copy) + return self.make_block(new_values, + copy=copy, + placement=np.arange(len(indexer))) def sparse_reindex(self, new_index): """ sparse reindex and return a new block @@ -2008,7 +1833,8 @@ def sparse_reindex(self, new_index): values = self.values values = values.sp_index.to_int_index().reindex( values.sp_values.astype('float64'), values.fill_value, new_index) - return self.make_block(values, sparse_index=new_index) + return self.make_block(values, sparse_index=new_index, + placement=self.ref_locs) def split_block_at(self, item): if len(self.items) == 1 and item == self.items[0]: @@ -2019,8 +1845,8 @@ def _try_cast_result(self, result, dtype=None): return result -def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, - fastpath=False, placement=None): +def make_block(values, placement, klass=None, ndim=None, + dtype=None, fastpath=False): if klass is None: dtype = dtype or values.dtype vtype = dtype.type @@ -2066,7 +1892,7 @@ def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, if klass is None: klass = ObjectBlock - return klass(values, items, ref_items, ndim=ndim, fastpath=fastpath, + return klass(values, ndim=ndim, fastpath=fastpath, placement=placement) @@ -2082,6 +1908,42 @@ class BlockManager(PandasObject): lightweight blocked set of labeled data to be manipulated by the DataFrame public API class + Attributes + ---------- 
+ shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_ftype_counts + get_dtypes + get_ftypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + get_scalar(label_tup) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + Parameters ---------- @@ -2091,18 +1953,21 @@ class BlockManager(PandasObject): This is *not* a public API class """ __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_has_sparse', '_ref_locs', '_items_map'] + '_is_consolidated', '_has_sparse', '_ref_locs'] def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): self.axes = [_ensure_index(ax) for ax in axes] self.blocks = blocks - ndim = self.ndim for block in blocks: - if not block.is_sparse and ndim != block.ndim: - raise AssertionError(('Number of Block dimensions (%d) must ' - 'equal number of axes (%d)') - % (block.ndim, ndim)) + if block.is_sparse: + if len(block.ref_locs) != 1: + raise AssertionError("Sparse block refers to multiple items") + else: + if self.ndim != block.ndim: + raise AssertionError(('Number of Block dimensions (%d) must ' + 'equal number of axes (%d)') + % (block.ndim, self.ndim)) if do_integrity_check: self._verify_integrity() @@ -2110,9 +1975,7 @@ def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): self._has_sparse = False self._consolidate_check() - # we have a duplicate items index, setup the block maps - if not self.items.is_unique: - self._set_ref_locs(do_refs=True) + self._rebuild_ref_locs() def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ @@ -2136,182 +1999,40 @@ def __nonzero__(self): @property def shape(self): - if getattr(self, '_shape', None) is None: - self._shape = tuple(len(ax) for ax in self.axes) - return self._shape + return tuple(len(ax) for ax in self.axes) @property def ndim(self): - if getattr(self, '_ndim', None) is None: - self._ndim = len(self.axes) - return self._ndim + return len(self.axes) - def _set_axis(self, axis, value, check_axis=True): - cur_axis = self.axes[axis] - value = _ensure_index(value) + def set_axis(self, axis, new_labels): + new_labels = _ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) - if check_axis and len(value) != len(cur_axis): + if new_len != old_len: raise ValueError('Length mismatch: Expected axis has %d elements, ' - 'new values have %d elements' % (len(cur_axis), - len(value))) - - self.axes[axis] = value - self._shape = None - return cur_axis, value - - def set_axis(self, axis, value, maybe_rename=True, check_axis=True): - cur_axis, value = self._set_axis(axis, value, check_axis) - - if axis == 0: - - # set/reset ref_locs based on the current index - # and map the new index if needed - self._set_ref_locs(labels=cur_axis) - - # take via ref_locs - for block in self.blocks: - block.set_ref_items(self.items, maybe_rename=maybe_rename) - - # set/reset ref_locs based on the new index - self._set_ref_locs(labels=value, do_refs=True) + 'new values have %d elements' % (old_len, new_len)) - def _reset_ref_locs(self): - """ take the current _ref_locs and reset ref_locs on the blocks - to correctly map, ignoring Nones; - reset both _items_map and _ref_locs """ - - # let's reset the ref_locs in individual 
blocks - if self.items.is_unique: - for b in self.blocks: - b._ref_locs = None - else: - for b in self.blocks: - b.reset_ref_locs() - self._rebuild_ref_locs() - - self._ref_locs = None - self._items_map = None + self.axes[axis] = new_labels def _rebuild_ref_locs(self): - """Take _ref_locs and set the individual block ref_locs, skipping Nones - no effect on a unique index - """ - if getattr(self, '_ref_locs', None) is not None: - item_count = 0 - for v in self._ref_locs: - if v is not None: - block, item_loc = v - if block._ref_locs is None: - block.reset_ref_locs() - block._ref_locs[item_loc] = item_count - item_count += 1 - - def _set_ref_locs(self, labels=None, do_refs=False): - """ - if we have a non-unique index on this axis, set the indexers - we need to set an absolute indexer for the blocks - return the indexer if we are not unique - - labels : the (new) labels for this manager - ref : boolean, whether to set the labels (one a 1-1 mapping) - - """ - - if labels is None: - labels = self.items - - # we are unique, and coming from a unique - is_unique = labels.is_unique - if is_unique and not do_refs: - - if not self.items.is_unique: - - # reset our ref locs - self._ref_locs = None - for b in self.blocks: - b._ref_locs = None - - return None - - # we are going to a non-unique index - # we have ref_locs on the block at this point - if (not is_unique and do_refs) or do_refs == 'force': - - # create the items map - im = getattr(self, '_items_map', None) - if im is None: - - im = dict() - for block in self.blocks: - - # if we have a duplicate index but - # _ref_locs have not been set - try: - rl = block.ref_locs - except: - raise AssertionError( - 'Cannot create BlockManager._ref_locs because ' - 'block [%s] with duplicate items [%s] does not ' - 'have _ref_locs set' % (block, labels)) - - m = maybe_create_block_in_items_map(im, block) - for i, item in enumerate(block.items): - m[i] = rl[i] - - self._items_map = im - - # create the _ref_loc map here - rl = [None] * len(labels) - for block, items in im.items(): - for i, loc in enumerate(items): - rl[loc] = (block, i) - self._ref_locs = rl - return rl - - elif do_refs: - self._reset_ref_locs() - - # return our cached _ref_locs (or will compute again - # when we recreate the block manager if needed - return getattr(self, '_ref_locs', None) - - def get_items_map(self, use_cached=True): """ - return an inverted ref_loc map for an item index - block -> item (in that block) location -> column location - - use_cached : boolean, use the cached items map, or recreate + Update mgr._ref_locs according to blk.ref_locs. 
""" + blocks = np.empty(self.shape[0], dtype=np.object_) + blk_locs = np.empty(self.shape[0], dtype=np.int_) + blk_locs.fill(-1) - # cache check - if use_cached: - im = getattr(self, '_items_map', None) - if im is not None: - return im - - im = dict() - rl = self._set_ref_locs() - - # we have a non-duplicative index - if rl is None: - - axis = self.axes[0] - for block in self.blocks: - - m = maybe_create_block_in_items_map(im, block) - for i, item in enumerate(block.items): - m[i] = axis.get_loc(item) - - # use the ref_locs to construct the map - else: - - for i, (block, idx) in enumerate(rl): + for blk in self.blocks: + rl = blk.ref_locs + blocks[rl] = blk + blk_locs[rl] = np.arange(len(rl)) - m = maybe_create_block_in_items_map(im, block) - m[idx] = i + if (blk_locs == -1).any(): + raise AssertionError("Gaps in blk ref_locs") - self._items_map = im - return im + self._ref_locs = lib.fast_zip([blocks, blk_locs]) # make items read only for now def _get_items(self): @@ -2327,23 +2048,6 @@ def _get_counts(self, f): counts[v] = counts.get(v, 0) + b.shape[0] return counts - def _get_types(self, f): - """ return a list of the f per item """ - self._consolidate_inplace() - - # unique - if self.items.is_unique: - l = [ None ] * len(self.items) - for b in self.blocks: - v = f(b) - for rl in b.ref_locs: - l[rl] = v - return l - - # non-unique - ref_locs = self._set_ref_locs() - return [ f(ref_locs[i][0]) for i, item in enumerate(self.items) ] - def get_dtype_counts(self): return self._get_counts(lambda b: b.dtype.name) @@ -2351,14 +2055,14 @@ def get_ftype_counts(self): return self._get_counts(lambda b: b.ftype) def get_dtypes(self): - return self._get_types(lambda b: b.dtype) + return [rl[0].dtype for rl in self._ref_locs] def get_ftypes(self): - return self._get_types(lambda b: b.ftype) + return [rl[0].ftype for rl in self._ref_locs] def __getstate__(self): block_values = [b.values for b in self.blocks] - block_items = [b.items for b in self.blocks] + block_items = [self.items.take(b.ref_locs) for b in self.blocks] axes_array = [ax for ax in self.axes] return axes_array, block_values, block_items @@ -2376,7 +2080,8 @@ def __setstate__(self, state): if values.dtype == 'M8[us]': values = values.astype('M8[ns]') - blk = make_block(values, items, self.axes[0]) + blk = make_block(values, + placement=self.axes[0].get_indexer(items)) blocks.append(blk) self.blocks = blocks @@ -2385,6 +2090,7 @@ def __setstate__(self, state): def _post_setstate(self): self._is_consolidated = False self._known_consolidated = False + self._rebuild_ref_locs() self._set_has_sparse() def __len__(self): @@ -2394,24 +2100,20 @@ def __unicode__(self): output = com.pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: - output += '\nItems: %s' % ax + output += u('\nItems: %s') % ax else: - output += '\nAxis %d: %s' % (i, ax) + output += u('\nAxis %d: %s') % (i, ax) for block in self.blocks: - output += '\n%s' % com.pprint_thing(block) + output += u('\n%s') % com.pprint_thing(block) return output def _verify_integrity(self): mgr_shape = self.shape - tot_items = sum(len(x.items) for x in self.blocks) + tot_items = sum(len(x.ref_locs) for x in self.blocks) for block in self.blocks: - if block.ref_items is not self.items: - raise AssertionError("Block ref_items must be BlockManager " - "items") - if not block.is_sparse and block.values.shape[1:] != mgr_shape[1:]: - construction_error( - tot_items, block.values.shape[1:], self.axes) + if not block.is_sparse and block.shape[1:] != mgr_shape[1:]: + 
construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError('Number of manager items must equal union of ' 'block items\n# manager items: {0}, # ' @@ -2437,18 +2139,54 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): """ result_blocks = [] - for blk in self.blocks: + + if filter is not None: + # filter kwarg is used in replace-* family of methods + filter_locs = set(self.items.get_indexer_for(filter)) + kwargs['filter'] = filter_locs + + if f == 'where' and kwargs.get('align', True): + align_copy = True + align_keys = ['other', 'cond'] + elif f == 'putmask' and kwargs.get('align', True): + align_copy = False + align_keys = ['new', 'mask'] + elif f == 'eval': + align_copy = False + align_keys = ['other'] + elif f == 'fillna': + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ['value'] + else: + align_keys = [] + + aligned_args = dict((k, kwargs[k]) for k in align_keys + if hasattr(kwargs[k], 'reindex_axis')) + + for b in self.blocks: if filter is not None: - kwargs['filter'] = set(filter) - if not blk.items.isin(filter).any(): - result_blocks.append(blk) + valid_locs = filter_locs.intersection(b.ref_locs) + if not valid_locs: + result_blocks.append(b) continue - applied = getattr(blk, f)(**kwargs) + + if aligned_args: + b_items = self.items.take(b.ref_locs) + + for k, obj in aligned_args.items(): + axis = getattr(obj, '_info_axis_number', 0) + kwargs[k] = obj.reindex_axis(b_items, axis=axis, + copy=align_copy) + + applied = getattr(b, f)(**kwargs) if isinstance(applied, list): result_blocks.extend(applied) else: result_blocks.append(applied) + if len(result_blocks) == 0: return self.make_empty(axes or self.axes) bm = self.__class__(result_blocks, axes or self.axes, @@ -2539,31 +2277,6 @@ def comp(s): bm._consolidate_inplace() return bm - def prepare_for_merge(self, **kwargs): - """ prepare for merging, return a new block manager with - Sparse -> Dense - """ - self._consolidate_inplace() - if self._has_sparse: - return self.apply('prepare_for_merge', **kwargs) - return self - - def post_merge(self, objs, **kwargs): - """ try to sparsify items that were previously sparse """ - is_sparse = defaultdict(list) - for o in objs: - for blk in o._data.blocks: - if blk.is_sparse: - - # record the dtype of each item - for i in blk.items: - is_sparse[i].append(blk.dtype) - - if len(is_sparse): - return self.apply('post_merge', items=is_sparse) - - return self - def is_consolidated(self): """ Return True if more than one block with the same dtype @@ -2599,163 +2312,79 @@ def is_datelike_mixed_type(self): self._consolidate_inplace() return any([block.is_datelike for block in self.blocks]) - def get_block_map(self, copy=False, typ=None, columns=None, - is_numeric=False, is_bool=False): - """ return a dictionary mapping the ftype -> block list - - Parameters - ---------- - typ : return a list/dict - copy : copy if indicated - columns : a column filter list - filter if the type is indicated """ - - # short circuit - mainly for merging - if (typ == 'dict' and columns is None and not is_numeric and - not is_bool and not copy): - bm = defaultdict(list) - for b in self.blocks: - bm[str(b.ftype)].append(b) - return bm - + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) - 
if is_numeric: - filter_blocks = lambda block: block.is_numeric - elif is_bool: - filter_blocks = lambda block: block.is_bool - else: - filter_blocks = lambda block: True - - def filter_columns(b): - if columns: - if not columns in b.items: - return None - b = b.reindex_items_from(columns) - return b - - maybe_copy = lambda b: b.copy() if copy else b - - def maybe_copy(b): - if copy: - b = b.copy() - return b - - if typ == 'list': - bm = [] - for b in self.blocks: - if filter_blocks(b): - b = filter_columns(b) - if b is not None: - bm.append(maybe_copy(b)) - - else: - if typ == 'dtype': - key = lambda b: b.dtype - else: - key = lambda b: b.ftype - bm = defaultdict(list) - for b in self.blocks: - if filter_blocks(b): - b = filter_columns(b) - if b is not None: - bm[str(key(b))].append(maybe_copy(b)) - return bm - - def get_bool_data(self, **kwargs): - kwargs['is_bool'] = True - return self.get_data(**kwargs) - - def get_numeric_data(self, **kwargs): - kwargs['is_numeric'] = True - return self.get_data(**kwargs) - - def get_data(self, copy=False, columns=None, **kwargs): + def get_numeric_data(self, copy=False): """ Parameters ---------- copy : boolean, default False Whether to copy the blocks """ - blocks = self.get_block_map( - typ='list', copy=copy, columns=columns, **kwargs) - if len(blocks) == 0: - return self.make_empty() - - return self.combine(blocks, copy=copy) + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) def combine(self, blocks, copy=True): """ return a new manager with the blocks """ + if len(blocks) == 0: + return self.make_empty() + indexer = np.sort(np.concatenate([b.ref_locs for b in blocks])) + inv_indexer = _invert_reordering(indexer) new_items = self.items.take(indexer) new_blocks = [] for b in blocks: - b = b.reindex_items_from(new_items, copy=copy) - new_blocks.extend(_valid_blocks(b)) + b = b.copy(deep=copy) + b._ref_locs = inv_indexer.take(b.ref_locs) + new_blocks.append(b) + new_axes = list(self.axes) new_axes[0] = new_items return self.__class__(new_blocks, new_axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] if axis == 0: new_items = new_axes[0] # we want to preserver the view of a single-block - if len(self.blocks) == 1: - + if (len(self.blocks) == 1 and + (self.blocks[0]._ref_locs == np.arange(self.shape[0])).all()): blk = self.blocks[0] - ref_locs = blk.take_ref_locs(slobj) - newb = make_block(blk._slice(slobj), new_items, new_items, + newb = make_block(blk._slice(slobj), klass=blk.__class__, fastpath=True, - placement=ref_locs) + placement=np.arange(len(new_items))) new_blocks = [newb] else: - return self.reindex_items( - new_items, indexer=np.arange(len(self.items))[slobj]) + return self.reindex_indexer( + new_items, indexer=np.arange(len(self.items))[slobj], + axis=0, allow_dups=True) else: - new_blocks = self._slice_blocks(slobj, axis) + slicer = [slice(None)] * self.ndim + slicer[axis] = slobj + + new_blocks = [make_block(block._slice(slicer), + klass=block.__class__, + fastpath=True, + placement=block.ref_locs) + for block in self.blocks] bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) bm._consolidate_inplace() return bm - def _slice_blocks(self, slobj, axis): - """ - slice the blocks using the provided slice object - this is only for slicing on axis != 0 - """ - - if axis == 0: - raise AssertionError("cannot _slice_blocks on axis=0") - - slicer = [slice(None, None) for _ in range(self.ndim)] - 
slicer[axis] = slobj - slicer = tuple(slicer) - is_unique = self.axes[0].is_unique - - def place(block): - if not is_unique: - return block._ref_locs - return None - - return [ make_block(block._slice(slicer), - block.items, - block.ref_items, - klass=block.__class__, - fastpath=True, - placement=place(block) - ) for block in self.blocks ] - - def get_series_dict(self): - # For DataFrame - return _blocks_to_series_dict(self.blocks, self.axes[1]) - def __contains__(self, item): return item in self.items @@ -2781,55 +2410,38 @@ def copy(self, deep=True): else: new_axes = list(self.axes) return self.apply('copy', axes=new_axes, deep=deep, - ref_items=new_axes[0], do_integrity_check=False) + do_integrity_check=False) def as_matrix(self, items=None): if len(self.blocks) == 0: - mat = np.empty(self.shape, dtype=float) - elif len(self.blocks) == 1: - blk = self.blocks[0] - if items is None or blk.items.equals(items): - # if not, then just call interleave per below - mat = blk.get_values() - else: - mat = self.reindex_items(items).as_matrix() + return np.empty(self.shape, dtype=float) + + if items is not None: + mgr = self.reindex_axis(items, axis=0) else: - if items is None: - mat = self._interleave(self.items) - else: - mat = self.reindex_items(items).as_matrix() + mgr = self - return mat + if (len(mgr.blocks) == 1 and + (mgr.blocks[0]._ref_locs is None or + (mgr.blocks[0]._ref_locs == np.arange(mgr.shape[0])).all())): + return mgr.blocks[0].get_values() + else: + return mgr._interleave() - def _interleave(self, items): + def _interleave(self): """ Return ndarray from blocks with specified item order Items must be contained in the blocks """ dtype = _interleaved_dtype(self.blocks) - items = _ensure_index(items) result = np.empty(self.shape, dtype=dtype) - itemmask = np.zeros(len(items), dtype=bool) - - # By construction, all of the item should be covered by one of the - # blocks - if items.is_unique: - - for block in self.blocks: - indexer = items.get_indexer(block.items) - if (indexer == -1).any(): - raise AssertionError('Items must contain all block items') - result[indexer] = block.get_values(dtype) - itemmask[indexer] = 1 - - else: + itemmask = np.zeros(self.shape[0]) - # non-unique, must use ref_locs - rl = self._set_ref_locs() - for i, (block, idx) in enumerate(rl): - result[i] = block.get_values(dtype)[idx] - itemmask[i] = 1 + for blk in self.blocks: + rl = blk.ref_locs + result[rl] = blk.get_values(dtype) + itemmask[rl] = 1 if not itemmask.all(): raise AssertionError('Some items were not contained in blocks') @@ -2863,22 +2475,17 @@ def xs(self, key, axis=1, copy=True, takeable=False): if len(self.blocks) > 1: # we must copy here as we are mixed type for blk in self.blocks: - newb = make_block(blk.values[slicer], - blk.items, - blk.ref_items, - klass=blk.__class__, - fastpath=True) + newb = make_block(values=blk.values[slicer], + klass=blk.__class__, fastpath=True, + placement=blk.ref_locs) new_blocks.append(newb) elif len(self.blocks) == 1: block = self.blocks[0] vals = block.values[slicer] if copy: vals = vals.copy() - new_blocks = [make_block(vals, - self.items, - self.items, - klass=block.__class__, - fastpath=True)] + new_blocks = [make_block(values=vals, placement=block.ref_locs, + klass=block.__class__, fastpath=True,)] return self.__class__(new_blocks, new_axes) @@ -2897,7 +2504,7 @@ def fast_xs(self, loc): # non-unique (GH4726) if not items.is_unique: - result = self._interleave(items) + result = self._interleave() if self.ndim == 2: result = result.T return result[loc] @@ 
-2907,9 +2514,10 @@ def fast_xs(self, loc): n = len(items) result = np.empty(n, dtype=dtype) for blk in self.blocks: - for j, item in enumerate(blk.items): - i = items.get_loc(item) - result[i] = blk._try_coerce_result(blk.iget((j, loc))) + # Such assignment may incorrectly coerce NaT to None + # result[blk.ref_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.ref_locs): + result[rl] = blk._try_coerce_result(blk.iget((i, loc))) return result @@ -2930,112 +2538,86 @@ def consolidate(self): def _consolidate_inplace(self): if not self.is_consolidated(): - self.blocks = _consolidate(self.blocks, self.items) - - # reset our mappings - if not self.items.is_unique: - self._ref_locs = None - self._items_map = None - self._set_ref_locs(do_refs=True) + self.blocks = _consolidate(self.blocks) self._is_consolidated = True self._known_consolidated = True self._set_has_sparse() + self._rebuild_ref_locs() def get(self, item): + """ + Return values for selected item (ndarray or BlockManager). + """ if self.items.is_unique: - if isnull(item): + if not isnull(item): + loc = self.items.get_loc(item) + else: indexer = np.arange(len(self.items))[isnull(self.items)] - return self.get_for_nan_indexer(indexer) - _, block = self._find_block(item) - return block.get(item) + # allow a single nan location indexer + if not np.isscalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc) else: if isnull(item): raise ValueError("cannot label index with a null key") - indexer = self.items.get_loc(item) - ref_locs = np.array(self._set_ref_locs()) - - # duplicate index but only a single result - if com.is_integer(indexer): - - b, loc = ref_locs[indexer] - values = [b.iget(loc)] - index = Index([self.items[indexer]]) - - # we have a multiple result, potentially across blocks - else: - - values = [block.iget(i) for block, i in ref_locs[indexer]] - index = self.items[indexer] - - # create and return a new block manager - axes = [index] + self.axes[1:] - blocks = form_blocks(values, index, axes) - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer(new_axis=self.items[indexer], + indexer=indexer, axis=0, allow_dups=True) def iget(self, i): - item = self.items[i] - - # unique - if self.items.is_unique: - if notnull(item): - return self.get(item) - return self.get_for_nan_indexer(i) - - ref_locs = self._set_ref_locs() - b, loc = ref_locs[i] - return b.iget(loc) - - def get_for_nan_indexer(self, indexer): - - # allow a single nan location indexer - if not np.isscalar(indexer): - if len(indexer) == 1: - indexer = indexer.item() - else: - raise ValueError("cannot label index with a null key") - - # take a nan indexer and return the values - ref_locs = self._set_ref_locs(do_refs='force') - b, loc = ref_locs[indexer] + b, loc = self._ref_locs[i] return b.iget(loc) def get_scalar(self, tup): """ Retrieve single item """ - item = tup[0] - _, blk = self._find_block(item) - - # this could obviously be seriously sped up in cython - item_loc = blk.items.get_loc(item), - full_loc = item_loc + tuple(ax.get_loc(x) - for ax, x in zip(self.axes[1:], tup[1:])) - return blk.values[full_loc] + full_loc = list(ax.get_loc(x) + for ax, x in zip(self.axes, tup)) + blk, blk_loc = self._ref_locs[full_loc[0]] + full_loc[0] = blk_loc + return blk.values[tuple(full_loc)] def delete(self, item): + """ + Delete selected item (items if non-unique) 
in-place. + """ + indexer = self.items.get_loc(item) - is_unique = self.items.is_unique - loc = self.items.get_loc(item) + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = is_deleted.cumsum() - # dupe keys may return mask - loc = _possibly_convert_to_indexer(loc) - self._delete_from_all_blocks(loc, item) + new_items = self.items[~is_deleted] + new_blocks = [] - # _ref_locs, and _items_map are good here - new_items = self.items.delete(loc) - self.set_items_norename(new_items) + for blk in self.blocks: + brl = blk.ref_locs + blk_del = is_deleted[brl] + blk_del_count = np.count_nonzero(blk_del) - self._known_consolidated = False + if blk_del_count == len(brl): + continue - if not is_unique: - self._consolidate_inplace() + blk._ref_locs -= ref_loc_offset[brl] + if blk_del_count != 0: + blk = blk._getitem_block(~blk_del) + + new_blocks.append(blk) + + self.axes[0] = new_items + self.blocks = new_blocks + self._shape = None + self._rebuild_ref_locs() def set(self, item, value, check=False): """ @@ -3043,508 +2625,284 @@ def set(self, item, value, check=False): contained in the current set of items if check, then validate that we are not setting the same data in-place """ - if not isinstance(value, SparseArray): + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + is_sparse = isinstance(value, SparseArray) + + if is_sparse: + assert self.ndim == 2 + + def value_getitem(locs): + return value + else: if value.ndim == self.ndim - 1: value = value.reshape((1,) + value.shape) + + def value_getitem(locs): + return value + else: + def value_getitem(locs): + return value[locs] if value.shape[1:] != self.shape[1:]: raise AssertionError('Shape of new values must be compatible ' 'with manager shape') - def _set_item(item, arr): - i, block = self._find_block(item) - if not block.should_store(value): - # delete from block, create and append new block - self._delete_from_block(i, item) - self._add_new_block(item, arr, loc=None) - else: - block.set(item, arr, check=check) - try: - loc = self.items.get_loc(item) - if isinstance(loc, int): - _set_item(self.items[loc], value) - else: - subset = self.items[loc] - if len(value) != len(subset): - raise AssertionError( - 'Number of items to set did not match') - - # we are inserting multiple non-unique items as replacements - # we are inserting one by one, so the index can go from unique - # to non-unique during the loop, need to have _ref_locs defined - # at all times - if np.isscalar(item) and (com.is_list_like(loc) or isinstance(loc, slice)): - - # first delete from all blocks - self.delete(item) - - loc = _possibly_convert_to_indexer(loc) - for i, (l, k, arr) in enumerate(zip(loc, subset, value)): + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return - # insert the item - self.insert( - l, k, arr[None, :], allow_duplicates=True) + if isinstance(loc, int): + loc = [loc] - # reset the _ref_locs on indiviual blocks - # rebuild ref_locs - if self.items.is_unique: - self._reset_ref_locs() - self._set_ref_locs(do_refs='force') + ref_locs = self._ref_locs[loc] - self._rebuild_ref_locs() + unfit_mgr_locs = [] + unfit_val_locs = [] + for blk, blk_locs, val_locs in ref_loc_groupby_block(ref_locs): + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs), check=check) + else: + unfit_mgr_locs.append(blk.ref_locs[blk_locs]) + unfit_val_locs.append(val_locs) + new_blk_ref_locs = np.delete(blk.ref_locs, blk_locs, axis=0) + 
new_blk_len = len(new_blk_ref_locs) + if not new_blk_len: + self.blocks.remove(blk) else: - for i, (item, arr) in enumerate(zip(subset, value)): - _set_item(item, arr[None, :]) - except KeyError: - # insert at end - self.insert(len(self.items), item, value) + blk.values = np.delete(blk.values, blk_locs, axis=0) + blk._ref_locs = new_blk_ref_locs + self._ref_locs[new_blk_ref_locs] = \ + lib.fast_zip([np.array([blk] * new_blk_len), + np.arange(new_blk_len)]) + + if unfit_val_locs: + unfit_val_locs = np.concatenate(unfit_val_locs) + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_val_locs) + + if is_sparse: + for mgr_loc in unfit_mgr_locs: + new_block = make_block(values=value.copy(), + ndim=self.ndim, + placement=[mgr_loc]) + self.blocks.append(new_block) + self._ref_locs[mgr_loc] = (new_block, 0) + else: + new_block = make_block(values=value[unfit_val_locs], + ndim=self.ndim, + placement=unfit_mgr_locs) - self._known_consolidated = False + self.blocks.append(new_block) + self._ref_locs[unfit_mgr_locs] = lib.fast_zip([ + np.array([new_block] * unfit_count, dtype=np.object_), + np.arange(unfit_count)]) + + # Newly created block's dtype may already be present. + self._known_consolidated = False def insert(self, loc, item, value, allow_duplicates=False): + """ + Insert item at selected position. + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ if not allow_duplicates and item in self.items: # Should this be a different kind of error?? raise ValueError('cannot insert %s, already exists' % item) - try: - new_items = self.items.insert(loc, item) - self.set_items_norename(new_items) + if not isinstance(loc, int): + raise TypeError("loc must be int") - # new block - self._add_new_block(item, value, loc=loc) + new_items = self.items.insert(loc, item) + block = make_block(values=value, + ndim=self.ndim, + placement=[loc]) + new_ref_locs = np.insert(self._ref_locs, loc, None, axis=0) + new_ref_locs[loc] = (block, 0) - except: + for blk in self.blocks: + blk._ref_locs[blk._ref_locs >= loc] += 1 - # so our insertion operation failed, so back out of the new items - # GH 3010 - new_items = self.items.delete(loc) - self.set_items_norename(new_items) + self.blocks.append(block) + self.axes[0] = new_items + self._shape = None + self._ref_locs = new_ref_locs - # re-raise - raise + self._known_consolidated = False if len(self.blocks) > 100: self._consolidate_inplace() - self._known_consolidated = False - - # clear the internal ref_loc mappings if necessary - if loc != len(self.items) - 1 and new_items.is_unique: - self.set_items_clear(new_items) - - def set_items_norename(self, value): - self.set_axis(0, value, maybe_rename=False, check_axis=False) - self._shape = None - - def set_items_clear(self, value): - """ clear the ref_locs on all blocks """ - self.set_axis(0, value, maybe_rename='clear', check_axis=False) - - def _delete_from_all_blocks(self, loc, item): - """ delete from the items loc the item - the item could be in multiple blocks which could - change each iteration (as we split blocks) """ - - # possibily convert to an indexer - loc = _possibly_convert_to_indexer(loc) - - if isinstance(loc, (list, tuple, np.ndarray)): - for l in loc: - for i, b in enumerate(self.blocks): - if item in b.items: - self._delete_from_block(i, item) - - else: - i, _ = self._find_block(item) - self._delete_from_block(i, item) - - def _delete_from_block(self, i, item): - """ - Delete 
and maybe remove the whole block - - Remap the split blocks to there old ranges, - so after this function, _ref_locs and _items_map (if used) - are correct for the items, None fills holes in _ref_locs - """ - block = self.blocks.pop(i) - ref_locs = self._set_ref_locs() - prev_items_map = self._items_map.pop( - block) if ref_locs is not None else None - - # if we can't consolidate, then we are removing this block in its - # entirey - if block._can_consolidate: - - # compute the split mask - loc = block.items.get_loc(item) - if type(loc) == slice or com.is_integer(loc): - mask = np.array([True] * len(block)) - mask[loc] = False - else: # already a mask, inverted - mask = -loc - - # split the block - counter = 0 - for s, e in com.split_ranges(mask): - - sblock = make_block(block.values[s:e], - block.items[s:e].copy(), - block.ref_items, - klass=block.__class__, - fastpath=True) - - self.blocks.append(sblock) - - # update the _ref_locs/_items_map - if ref_locs is not None: - - # fill the item_map out for this sub-block - m = maybe_create_block_in_items_map( - self._items_map, sblock) - for j, itm in enumerate(sblock.items): - - # is this item masked (e.g. was deleted)? - while (True): - - if counter > len(mask) or mask[counter]: - break - else: - counter += 1 - - # find my mapping location - m[j] = prev_items_map[counter] - counter += 1 - - # set the ref_locs in this block - sblock.set_ref_locs(m) + def reindex_axis(self, new_axis, axis, method=None, limit=None, + fill_value=None, copy=True): + mgr = self if not copy else self.copy(deep=True) - # reset the ref_locs to the new structure - if ref_locs is not None: - - # items_map is now good, with the original locations - self._set_ref_locs(do_refs=True) - - # reset the ref_locs based on the now good block._ref_locs - self._reset_ref_locs() - - def _add_new_block(self, item, value, loc=None): - # Do we care about dtype at the moment? - - # hm, elaborate hack? 
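
The block-splitting `_delete_from_block` machinery removed above is superseded by the placement arithmetic in the new `delete` earlier in this hunk: mark the deleted manager locations, shift the surviving locations down with a cumulative sum, and drop blocks that end up empty. A standalone sketch of that bookkeeping (`delete_item` and its inputs are hypothetical, not the pandas code):

```python
import numpy as np

def delete_item(blocks, n_items, loc):
    """blocks: list of (values, ref_locs) pairs; drop manager item ``loc``."""
    is_deleted = np.zeros(n_items, dtype=bool)
    is_deleted[loc] = True
    # every manager location after ``loc`` shifts left by one
    ref_loc_offset = is_deleted.cumsum()

    new_blocks = []
    for values, ref_locs in blocks:
        keep = ~is_deleted[ref_locs]
        if not keep.any():
            continue  # block held only the deleted item
        new_blocks.append((values[keep],
                           (ref_locs - ref_loc_offset[ref_locs])[keep]))
    return new_blocks

# two blocks covering manager items 0..3
blocks = [(np.array([[1., 2.], [3., 4.]]), np.array([0, 2])),
          (np.array([[5, 6], [7, 8]]), np.array([1, 3]))]
print(delete_item(blocks, 4, 2))  # item 2 gone, item 3 becomes item 2
```
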
- if loc is None: - loc = self.items.get_loc(item) - new_block = make_block(value, self.items[loc:loc + 1].copy(), - self.items, fastpath=True) - self.blocks.append(new_block) - - # set ref_locs based on the this new block - # and add to the ref/items maps - if not self.items.is_unique: - - # insert into the ref_locs at the appropriate location - # _ref_locs is already long enough, - # but may need to shift elements - new_block.set_ref_locs([0]) - - # need to shift elements to the right - if self._ref_locs[loc] is not None: - for i in reversed(lrange(loc + 1, len(self._ref_locs))): - self._ref_locs[i] = self._ref_locs[i - 1] - - self._ref_locs[loc] = (new_block, 0) - - # and reset - self._reset_ref_locs() - self._set_ref_locs(do_refs=True) - - def _find_block(self, item): - self._check_have(item) - for i, block in enumerate(self.blocks): - if item in block: - return i, block - - def _check_have(self, item): - if item not in self.items: - raise KeyError('no item named %s' % com.pprint_thing(item)) - - def reindex_axis(self, new_axis, indexer=None, method=None, axis=0, - fill_value=None, limit=None, copy=True): new_axis = _ensure_index(new_axis) - cur_axis = self.axes[axis] - - if new_axis.equals(cur_axis): - if copy: - result = self.copy(deep=True) - result.axes[axis] = new_axis - result._shape = None - - if axis == 0: - # patch ref_items, #1823 - for blk in result.blocks: - blk.ref_items = new_axis - - return result - else: - return self - - if axis == 0: - if method is not None or limit is not None: - return self.reindex_axis0_with_method( - new_axis, indexer=indexer, method=method, - fill_value=fill_value, limit=limit, copy=copy - ) - return self.reindex_items(new_axis, indexer=indexer, copy=copy, - fill_value=fill_value) + new_axis, indexer = mgr.axes[axis].reindex( + new_axis, method=method, limit=limit, copy_if_needed=True) - new_axis, indexer = cur_axis.reindex( - new_axis, method, copy_if_needed=True) - return self.reindex_indexer(new_axis, indexer, axis=axis, - fill_value=fill_value) + return mgr.reindex_indexer(new_axis, indexer, axis=axis, + fill_value=fill_value) - def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - raise AssertionError('method argument not supported for ' - 'axis == 0') - - def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, + def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, allow_dups=False): """ pandas-indexer with -1's only. 
""" # trying to reindex on an axis with duplicates - if not allow_dups and not self.axes[axis].is_unique and len(indexer): + if (not allow_dups and not self.axes[axis].is_unique + and indexer is not None and len(indexer)): raise ValueError("cannot reindex from a duplicate axis") - if not self.is_consolidated(): - self = self.consolidate() - - if axis == 0: - return self._reindex_indexer_items(new_axis, indexer, fill_value) - - new_blocks = [] - for block in self.blocks: - newb = block.reindex_axis( - indexer, axis=axis, fill_value=fill_value) - new_blocks.append(newb) - - new_axes = list(self.axes) - new_axes[axis] = new_axis - return self.__class__(new_blocks, new_axes) - - def _reindex_indexer_items(self, new_items, indexer, fill_value): - # TODO: less efficient than I'd like - - item_order = com.take_1d(self.items.values, indexer) - new_axes = [new_items] + self.axes[1:] - new_blocks = [] - is_unique = new_items.is_unique - - # we have duplicates in the items and what we are reindexing - if not is_unique and not self.items.is_unique: + if axis >= self.ndim: + raise AssertionError("Requested axis not found in manager") - rl = self._set_ref_locs(do_refs='force') - for i, idx in enumerate(indexer): - item = new_items.take([i]) - if idx >= 0: - blk, lidx = rl[idx] - blk = make_block(_block_shape(blk.iget(lidx)), item, - new_items, ndim=self.ndim, fastpath=True, - placement=[i]) - - # a missing value - else: - blk = self._make_na_block(item, - new_items, - placement=[i], - fill_value=fill_value) - new_blocks.append(blk) - new_blocks = _consolidate(new_blocks, new_items) + # FIXME: this code comes from generic.py, see if any of that is needed + # elif (baxis == 0 and + # index is not new_data.axes[baxis]): + # new_data = new_data.reindex_items(index, copy=copy, + # fill_value=fill_value) + # elif (baxis > 0 and index is not None and + # index is not new_data.axes[baxis]): + # new_data = new_data.copy(deep=copy) + # new_data.set_axis(baxis, index) - # keep track of what items aren't found anywhere + if axis == 0: + new_blocks = self._get_blocks_for_items_indexer(indexer, + fill_value) else: - l = np.arange(len(item_order)) - mask = np.zeros(len(item_order), dtype=bool) - - for blk in self.blocks: - blk_indexer = blk.items.get_indexer(item_order) - selector = blk_indexer != -1 - - # update with observed items - mask |= selector - - if not selector.any(): - continue - - new_block_items = new_items.take(selector.nonzero()[0]) - new_values = com.take_nd(blk.values, blk_indexer[selector], axis=0, - allow_fill=False) - placement = l[selector] if not is_unique else None - new_blocks.append(make_block(new_values, - new_block_items, - new_items, - placement=placement, - fastpath=True)) - - if not mask.all(): - na_items = new_items[-mask] - placement = l[-mask] if not is_unique else None - na_block = self._make_na_block(na_items, - new_items, - placement=placement, - fill_value=fill_value) - new_blocks.append(na_block) - new_blocks = _consolidate(new_blocks, new_items) + # TODO: is this faster than blk.reindex_axis? 
+ # return self.apply('take', + # axes=new_axes, + # indexer=indexer, + # ref_items=new_axes[0], + # new_axis=new_axes[axis], + # axis=axis) + new_blocks = [blk.reindex_axis(indexer, axis=axis, + fill_value=fill_value) + for blk in self.blocks] + new_axes = list(self.axes) + new_axes[axis] = new_axis return self.__class__(new_blocks, new_axes) - def reindex_items(self, new_items, indexer=None, copy=True, - fill_value=None): - """ - + def _get_blocks_for_items_indexer(self, indexer, fill_value): """ - new_items = _ensure_index(new_items) - data = self - if not data.is_consolidated(): - data = data.consolidate() - return data.reindex_items(new_items, copy=copy, - fill_value=fill_value) + Reindex blocks at axis=0 (overloaded for SingleBlockManager). - if indexer is None: - new_items, indexer = self.items.reindex(new_items, - copy_if_needed=True) - new_axes = [new_items] + self.axes[1:] + Returns + ------- + new_blocks : list of Block - # could have so me pathological (MultiIndex) issues here + """ + # fill_value[0] == None will group soon-to-be-added items under None + # fill_value[1] is an arbitrary integer (it's ignored) + new_ref_locs = com.take_1d(self._ref_locs, indexer, + fill_value=(None, 0)) new_blocks = [] - if indexer is None: - for blk in self.blocks: - if copy: - blk = blk.reindex_items_from(new_items) - else: - blk.ref_items = new_items - new_blocks.extend(_valid_blocks(blk)) - else: - - # unique - if self.axes[0].is_unique and new_items.is_unique: - - # ok to use the global indexer if only 1 block - i = indexer if len(self.blocks) == 1 else None - - for block in self.blocks: - blk = block.reindex_items_from(new_items, indexer=i, copy=copy) - new_blocks.extend(_valid_blocks(blk)) - - # non-unique + for blk, blk_locs, mgr_locs in ref_loc_groupby_block(new_ref_locs): + if blk is None: + new_blocks.append(self._make_na_block( + placement=mgr_locs, fill_value=fill_value)) else: - rl = self._set_ref_locs(do_refs='force') - for i, idx in enumerate(indexer): - blk, lidx = rl[idx] - item = new_items.take([i]) - blk = make_block(_block_shape(blk.iget(lidx)), item, - new_items, ndim=self.ndim, fastpath=True, - placement=[i]) - new_blocks.append(blk) - - # add a na block if we are missing items - mask = indexer == -1 - if mask.any(): - extra_items = new_items[mask] - na_block = self._make_na_block(extra_items, new_items, - fill_value=fill_value) - new_blocks.append(na_block) - new_blocks = _consolidate(new_blocks, new_items) - - # consolidate - # import for non-unique which creates a block for each item - # and they must be consolidated before passing on - new_blocks = _consolidate(new_blocks, new_items) - - return self.__class__(new_blocks, new_axes) + # Otherwise, slicing along items axis is necessary. + if blk.is_sparse: + # If it's a sparse block, it's easy: + # + # - it can only contain 1 item + # - if blk is here, the item wasn't deleted + # - if blk wasn't handled above, the item is multiplied + # + # Hence the block is replicated. + for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk._ref_locs = np.array([mgr_loc]) + new_blocks.append(newblk) - def _make_na_block(self, items, ref_items, placement=None, - fill_value=None): + else: + # FIXME: this hack makes sure post-reindex blocks enumerate + # manager locs in ascending order. It was implemented to + # make pytables serialization test happy and should be + # removed once the codebase successfully switches to + # axis-oblivious blocks & blockmanagers. 
+ order = np.argsort(mgr_locs) + blk_locs = blk_locs.take(order) + mgr_locs = mgr_locs.take(order) + + new_values = com.take_1d(blk.values, blk_locs, + axis=0, allow_fill=False) + newblk = blk.__class__(values=new_values, + ndim=blk.ndim, + fastpath=True, + placement=mgr_locs,) + new_blocks.append(newblk) + + return new_blocks + + def _make_na_block(self, placement, fill_value=None): # TODO: infer dtypes other than float64 from fill_value if fill_value is None: fill_value = np.nan block_shape = list(self.shape) - block_shape[0] = len(items) + block_shape[0] = len(placement) dtype, fill_value = com._infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, items, ref_items, placement=placement) - - def take(self, indexer, new_index=None, axis=1, verify=True): - if axis < 1: - raise AssertionError('axis must be at least 1, got %d' % axis) + return make_block(block_values, placement=placement) + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. + """ self._consolidate_inplace() - if isinstance(indexer, list): - indexer = np.array(indexer) + indexer = np.asanyarray(indexer, dtype=np.int_) - indexer = com._ensure_platform_int(indexer) - n = len(self.axes[axis]) + n = self.shape[axis] + if convert: + indexer = _maybe_convert_indices(indexer, n) if verify: - indexer = _maybe_convert_indices(indexer, n) if ((indexer == -1) | (indexer >= n)).any(): raise Exception('Indices must be nonzero and less than ' 'the axis length') - new_axes = list(self.axes) - if new_index is None: - new_index = self.axes[axis].take(indexer) - - new_axes[axis] = new_index - return self.apply('take', - axes=new_axes, - indexer=indexer, - ref_items=new_axes[0], - new_axis=new_axes[axis], - axis=axis) - - def merge(self, other, lsuffix=None, rsuffix=None): + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer(new_axis=new_labels, indexer=indexer, + axis=axis, allow_dups=True) + + def merge(self, other, lsuffix='', rsuffix=''): if not self._is_indexed_like(other): raise AssertionError('Must have same axes to merge managers') - this, other = self._maybe_rename_join(other, lsuffix, rsuffix) - - cons_items = this.items + other.items - new_axes = list(this.axes) - new_axes[0] = cons_items + l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, + right=other.items, rsuffix=rsuffix) + new_items = _concat_indexes([l, r]) - consolidated = _consolidate(this.blocks + other.blocks, cons_items) - return self.__class__(consolidated, new_axes) - - def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True): - to_rename = self.items.intersection(other.items) - if len(to_rename) > 0: - if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: %s' - % to_rename) - - def lrenamer(x): - if x in to_rename: - return '%s%s' % (x, lsuffix) - return x - - def rrenamer(x): - if x in to_rename: - return '%s%s' % (x, rsuffix) - return x + new_blocks = [] + for blocks, offset in [(self.blocks, 0), + (other.blocks, self.shape[0])]: + for blk in blocks: + blk = blk.copy(deep=False) + blk._ref_locs += offset + new_blocks.append(blk) - this = self.rename_items(lrenamer, copy=copydata) - other = other.rename_items(rrenamer, copy=copydata) - else: - this = self + new_axes = list(self.axes) + new_axes[0] = new_items - return this, other + return self.__class__(_consolidate(new_blocks), new_axes) def _is_indexed_like(self, other): """ @@ -3558,82 
+2916,38 @@ def _is_indexed_like(self, other): return False return True - def rename(self, mapper, axis, copy=False): - """ generic rename """ + def rename_axis(self, mapper, axis, copy=True): + """ + Rename one of axes. - if axis == 0: - return self.rename_items(mapper, copy=copy) - return self.rename_axis(mapper, axis=axis) + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True - def rename_axis(self, mapper, axis=1): + """ + new_axis = _transform_index(self.axes[axis], mapper) - index = self.axes[axis] - if isinstance(index, MultiIndex): - new_axis = MultiIndex.from_tuples( - [tuple(mapper(y) for y in x) for x in index], - names=index.names) + if axis != 0: + new_blocks = self.blocks else: - new_axis = Index([mapper(x) for x in index], name=index.name) - - if not new_axis.is_unique: - raise AssertionError('New axis must be unique to rename') + new_blocks = [] + for block in self.blocks: + newb = block.copy(deep=copy) + new_blocks.append(newb) new_axes = list(self.axes) new_axes[axis] = new_axis - return self.__class__(self.blocks, new_axes) - - def rename_items(self, mapper, copy=True): - if isinstance(self.items, MultiIndex): - items = [tuple(mapper(y) for y in x) for x in self.items] - new_items = MultiIndex.from_tuples(items, names=self.items.names) - else: - items = [mapper(x) for x in self.items] - new_items = Index(items, name=self.items.name) - - new_blocks = [] - for block in self.blocks: - newb = block.copy(deep=copy) - newb.set_ref_items(new_items, maybe_rename=True) - new_blocks.append(newb) - new_axes = list(self.axes) - new_axes[0] = new_items return self.__class__(new_blocks, new_axes) def add_prefix(self, prefix): f = (('%s' % prefix) + '%s').__mod__ - return self.rename_items(f) + return self.rename_axis(f, axis=0) def add_suffix(self, suffix): f = ('%s' + ('%s' % suffix)).__mod__ - return self.rename_items(f) - - @property - def block_id_vector(self): - # TODO - result = np.empty(len(self.items), dtype=int) - result.fill(-1) - - for i, blk in enumerate(self.blocks): - indexer = self.items.get_indexer(blk.items) - if (indexer == -1).any(): - raise AssertionError('Block items must be in manager items') - result.put(indexer, i) - - if (result < 0).any(): - raise AssertionError('Some items were not in any block') - return result - - @property - def item_dtypes(self): - result = np.empty(len(self.items), dtype='O') - mask = np.zeros(len(self.items), dtype=bool) - for i, blk in enumerate(self.blocks): - indexer = self.items.get_indexer(blk.items) - result.put(indexer, blk.dtype.name) - mask.put(indexer, 1) - if not (mask.all()): - raise AssertionError('Some items were not in any block') - return result + return self.rename_axis(f, axis=0) def equals(self, other): self_axes, other_axes = self.axes, other.axes @@ -3646,14 +2960,24 @@ def equals(self, other): return all(block.equals(oblock) for block, oblock in zip(self.blocks, other.blocks)) + def group_blocks_by_ftype(self): + """ + Combine blocks into map: ftype -> [blk0, blk1, ...]. 
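
The new `group_blocks_by_ftype` is a single `defaultdict` pass keyed on each block's `ftype` string. A toy illustration (`FakeBlock` is a stand-in; the `'float64:dense'` strings assume the dtype:density format pandas ftypes use):

```python
from collections import defaultdict

class FakeBlock:
    def __init__(self, ftype):
        self.ftype = ftype

blocks = [FakeBlock('float64:dense'), FakeBlock('float64:sparse'),
          FakeBlock('float64:dense')]
bm = defaultdict(list)
for b in blocks:
    bm[str(b.ftype)].append(b)
print({k: len(v) for k, v in bm.items()})
# {'float64:dense': 2, 'float64:sparse': 1}
```
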
+ + """ + bm = defaultdict(list) + for b in self.blocks: + bm[str(b.ftype)].append(b) + return bm + + class SingleBlockManager(BlockManager): """ manage a single block with """ ndim = 1 _is_consolidated = True _known_consolidated = True - __slots__ = ['axes', 'blocks', '_block', - '_values', '_shape', '_has_sparse'] + __slots__ = ['axes', 'blocks'] def __init__(self, block, axis, do_integrity_check=False, fastpath=True): @@ -3676,7 +3000,8 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): 'more than 1 block') block = block[0] if not isinstance(block, Block): - block = make_block(block, axis, axis, ndim=1, fastpath=True) + block = make_block(block, ndim=1, fastpath=True, + placement=np.arange(len(axis))) else: @@ -3689,7 +3014,7 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): if len(block) > 1: dtype = _interleaved_dtype(block) block = [b.astype(dtype) for b in block] - block = _consolidate(block, axis) + block = _consolidate(block) if len(block) != 1: raise ValueError('Cannot create SingleBlockManager with ' @@ -3697,48 +3022,81 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): block = block[0] if not isinstance(block, Block): - block = make_block(block, axis, axis, ndim=1, fastpath=True) + block = make_block(block, axis, ndim=1, + fastpath=True, placement=None) self.blocks = [block] - self._block = self.blocks[0] - self._values = self._block.values - self._has_sparse = self._block.is_sparse def _post_setstate(self): - self._block = self.blocks[0] - self._values = self._block.values + pass - def _get_counts(self, f): - return { f(self._block) : 1 } + @property + def _block(self): + return self.blocks[0] @property - def shape(self): - if getattr(self, '_shape', None) is None: - self._shape = tuple([len(self.axes[0])]) - return self._shape + def _values(self): + return self._block.values - def apply(self, f, axes=None, do_integrity_check=False, **kwargs): - """ - fast path for SingleBlock Manager + @property + def _has_sparse(self): + return self._block.is_sparse - ssee also BlockManager.apply - """ - applied = getattr(self._block, f)(**kwargs) - bm = self.__class__(applied, axes or self.axes, - do_integrity_check=do_integrity_check) - bm._consolidate_inplace() - return bm + def _set_has_sparse(self): + # _has_sparse is a property, nothing to set here + pass + + # def apply(self, f, axes=None, do_integrity_check=False, **kwargs): + # """ + # fast path for SingleBlock Manager + + # ssee also BlockManager.apply + # """ + # applied = getattr(self._block, f)(**kwargs) + # bm = self.__class__(applied, axes or self.axes, + # do_integrity_check=do_integrity_check) + # bm._consolidate_inplace() + # return bm def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return - if not copy and self.index.equals(new_axis): - return self + if self.index.equals(new_axis): + if copy: + return self.copy(deep=True) + else: + return self - block = self._block.reindex_items_from(new_axis, indexer=indexer, - method=method, - fill_value=fill_value, - limit=limit, copy=copy) + values = self._block.get_values() + + if indexer is None: + indexer = self.items.get_indexer_for(new_axis) + + if fill_value is None: + # FIXME: is fill_value used correctly in sparse blocks? 
+ if not self._block.is_sparse: + fill_value = self._block.fill_value + else: + fill_value = np.nan + + new_values = com.take_1d(values, indexer, + fill_value=fill_value) + + # fill if needed + if method is not None or limit is not None: + new_values = com.interpolate_2d(new_values, method=method, + limit=limit, fill_value=fill_value) + + if self._block.is_sparse: + make_block = self._block.make_block + + block = make_block(new_values, copy=copy, + placement=np.arange(len(new_axis))) + + # block = self._block.reindex_items_from(new_axis, indexer=indexer, + # method=method, + # fill_value=fill_value, + # limit=limit, copy=copy) mgr = SingleBlockManager(block, new_axis) mgr._consolidate_inplace() return mgr @@ -3748,45 +3106,29 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value): return self.reindex(new_items, indexer=indexer, fill_value=fill_value, copy=False) - def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - return self.reindex(new_axis, indexer=indexer, method=method, - fill_value=fill_value, limit=limit, copy=copy) - def _delete_from_block(self, i, item): super(SingleBlockManager, self)._delete_from_block(i, item) # possibly need to merge split blocks if len(self.blocks) > 1: - new_items = Index(list(itertools.chain(*[ b.items for b in self.blocks ]))) - block = make_block(np.concatenate([ b.values for b in self.blocks ]), - new_items, - new_items, - dtype=self._block.dtype) + new_values = np.concatenate([b.values for b in self.blocks]) + new_items = Index(np.concatenate([b.items for b in self.blocks])) + + block = make_block(values=new_values, placement=None, + dtype=self._block.dtype,) elif len(self.blocks): block = self.blocks[0] else: - block = make_block(np.array([], dtype=self._block.dtype), [], []) + block = make_block(values=np.array([], dtype=self._block.dtype), + placement=None) self.blocks = [block] - self._block = block - self._values = self._block.values def get_slice(self, slobj): return self.__class__(self._block._slice(slobj), self.index[slobj], fastpath=True) - def set_axis(self, axis, value, maybe_rename=True, check_axis=True): - cur_axis, value = self._set_axis(axis, value, check_axis) - self._block.set_ref_items(self.items, maybe_rename=maybe_rename) - - def set_ref_items(self, ref_items, maybe_rename=True): - """ we can optimize and our ref_locs are always equal to ref_items """ - if maybe_rename: - self.items = ref_items - self.ref_items = ref_items - @property def index(self): return self.axes[0] @@ -3804,6 +3146,18 @@ def dtype(self): def ftype(self): return self._block.ftype + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_ftype_counts(self): + return {self.ftype: 1} + + def get_dtypes(self): + return [self._block.dtype] + + def get_ftypes(self): + return [self._block.ftype] + @property def values(self): return self._values.view() @@ -3825,6 +3179,22 @@ def _consolidate_check(self): def _consolidate_inplace(self): pass + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + # Also, make sure dtype is preserved. 
+ dtype = self._block.dtype + + super(SingleBlockManager, self).delete(item) + + if not self.blocks: + self.blocks = [make_block(values=np.empty(0, dtype=dtype), + placement=np.arange(len(self.items)), + ndim=1, dtype=dtype, fastpath=True)] + def fast_xs(self, loc): """ fast path for getting a cross-section @@ -3832,6 +3202,26 @@ def fast_xs(self, loc): """ return self._block.values[loc] + def _get_blocks_for_items_indexer(self, indexer, fill_value): + """ + Reindex blocks at axis=0 (overloaded for SingleBlockManager). + + Returns + ------- + new_blocks : list of Block + + """ + if indexer is None: + new_values = self._values.copy() + else: + new_values = com.take_1d(self._values, indexer, + fill_value=fill_value) + + return [make_block(values=new_values, + placement=np.arange(len(new_values)), + ndim=self.ndim, fastpath=True)] + + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ passed = tuple(map(int, [tot_items] + list(block_shape))) @@ -3841,14 +3231,15 @@ def construction_error(tot_items, block_shape, axes, e=None): raise ValueError("Shape of passed values is {0}, indices imply {1}".format( passed,implied)) + def create_block_manager_from_blocks(blocks, axes): try: - - # if we are passed values, make the blocks if len(blocks) == 1 and not isinstance(blocks[0], Block): - placement = None if axes[0].is_unique else np.arange(len(axes[0])) - blocks = [ - make_block(blocks[0], axes[0], axes[0], placement=placement)] + # It's OK if a single block is passed as values, its placement is + # basically "all items", but if there're many, don't bother + # converting, it's an error anyway. + blocks = [make_block(values=blocks[0], + placement=np.arange(len(axes[0])),)] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() @@ -3870,26 +3261,7 @@ def create_block_manager_from_arrays(arrays, names, axes): construction_error(len(arrays), arrays[0].shape[1:], axes, e) -def maybe_create_block_in_items_map(im, block): - """ create/return the block in an items_map """ - try: - return im[block] - except: - im[block] = l = [None] * len(block.items) - return l - - def form_blocks(arrays, names, axes): - - # pre-filter out items if we passed it - items = axes[0] - - if len(arrays) < len(items): - nn = set(names) - extra_items = Index([i for i in items if i not in nn]) - else: - extra_items = [] - # put "leftover" items in float bucket, where else? # generalize? 
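
A standalone sketch of the dtype bucketing `form_blocks` performs below: each array is routed into a per-dtype list together with its manager location, so blocks can later be built with explicit integer placements (the dict literal here is a hypothetical input, and only a few of the real buckets are shown):

```python
import numpy as np

arrays = {0: np.array([1., 2.]),       # float   -> float bucket
          1: np.array([True, False]),  # bool    -> bool bucket
          2: np.array([1, 2])}         # integer -> int bucket

buckets = {'float': [], 'int': [], 'bool': [], 'object': []}
for loc, arr in arrays.items():
    if arr.dtype == np.bool_:
        buckets['bool'].append((loc, arr))
    elif np.issubdtype(arr.dtype, np.floating):
        buckets['float'].append((loc, arr))
    elif np.issubdtype(arr.dtype, np.integer):
        buckets['int'].append((loc, arr))
    else:
        buckets['object'].append((loc, arr))

print({k: [loc for loc, _ in v] for k, v in buckets.items()})
# {'float': [0], 'int': [2], 'bool': [1], 'object': []}
```
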
float_items = [] @@ -3899,8 +3271,23 @@ def form_blocks(arrays, names, axes): object_items = [] sparse_items = [] datetime_items = [] + extra_locs = [] + + names_idx = Index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) + + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue + + k = names[name_idx] + v = arrays[name_idx] - for i, (k, v) in enumerate(zip(names, arrays)): if isinstance(v, (SparseArray, ABCSparseSeries)): sparse_items.append((i, k, v)) elif issubclass(v.dtype.type, np.floating): @@ -3927,72 +3314,67 @@ def form_blocks(arrays, names, axes): else: object_items.append((i, k, v)) - is_unique = items.is_unique blocks = [] if len(float_items): - float_blocks = _multi_blockify(float_items, items, is_unique=is_unique) + float_blocks = _multi_blockify(float_items) blocks.extend(float_blocks) if len(complex_items): complex_blocks = _simple_blockify( - complex_items, items, np.complex128, is_unique=is_unique) + complex_items, np.complex128) blocks.extend(complex_blocks) if len(int_items): - int_blocks = _multi_blockify(int_items, items, is_unique=is_unique) + int_blocks = _multi_blockify(int_items) blocks.extend(int_blocks) if len(datetime_items): datetime_blocks = _simple_blockify( - datetime_items, items, _NS_DTYPE, is_unique=is_unique) + datetime_items, _NS_DTYPE) blocks.extend(datetime_blocks) if len(bool_items): bool_blocks = _simple_blockify( - bool_items, items, np.bool_, is_unique=is_unique) + bool_items, np.bool_) blocks.extend(bool_blocks) if len(object_items) > 0: object_blocks = _simple_blockify( - object_items, items, np.object_, is_unique=is_unique) + object_items, np.object_) blocks.extend(object_blocks) if len(sparse_items) > 0: - sparse_blocks = _sparse_blockify(sparse_items, items) + sparse_blocks = _sparse_blockify(sparse_items) blocks.extend(sparse_blocks) - if len(extra_items): - shape = (len(extra_items),) + tuple(len(x) for x in axes[1:]) + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) # empty items -> dtype object block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - placement = None if is_unique else np.arange(len(extra_items)) - na_block = make_block( - block_values, extra_items, items, placement=placement) + na_block = make_block(block_values, placement=extra_locs) blocks.append(na_block) return blocks -def _simple_blockify(tuples, ref_items, dtype, is_unique=True): +def _simple_blockify(tuples, dtype): """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """ - block_items, values, placement = _stack_arrays(tuples, ref_items, dtype) + values, placement = _stack_arrays(tuples, dtype) # CHECK DTYPE? 
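
The `_stack_arrays` step used above combines same-dtype 1-D arrays into one 2-D block payload, with the original manager locations becoming the block's placement. In miniature:

```python
import numpy as np

# (mgr_loc, name, values) tuples, as in the blockify helpers above
tuples = [(0, 'a', np.array([1., 2., 3.])),
          (3, 'd', np.array([4., 5., 6.]))]

placement = np.array([t[0] for t in tuples])
stacked = np.vstack([t[2] for t in tuples])
print(stacked.shape, placement)  # (2, 3) [0 3]
```
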
if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - if is_unique: - placement = None - block = make_block(values, block_items, ref_items, placement=placement) + block = make_block(values, placement=placement) return [block] -def _multi_blockify(tuples, ref_items, dtype=None, is_unique=True): +def _multi_blockify(tuples, dtype=None): """ return an array of blocks that potentially have different dtypes """ # group by dtype @@ -4001,37 +3383,32 @@ def _multi_blockify(tuples, ref_items, dtype=None, is_unique=True): new_blocks = [] for dtype, tup_block in grouper: - block_items, values, placement = _stack_arrays( - list(tup_block), ref_items, dtype) - if is_unique: - placement = None - block = make_block(values, block_items, ref_items, placement=placement) + values, placement = _stack_arrays( + list(tup_block), dtype) + + block = make_block(values, placement=placement) new_blocks.append(block) return new_blocks -def _sparse_blockify(tuples, ref_items, dtype=None): +def _sparse_blockify(tuples, dtype=None): """ return an array of blocks that potentially have different dtypes (and are sparse) """ new_blocks = [] for i, names, array in tuples: - - if not isinstance(names, (list, tuple)): - names = [names] - items = ref_items[ref_items.isin(names)] - array = _maybe_to_sparse(array) block = make_block( - array, items, ref_items, klass=SparseBlock, fastpath=True) + array, klass=SparseBlock, fastpath=True, + placement=[i]) new_blocks.append(block) return new_blocks -def _stack_arrays(tuples, ref_items, dtype): +def _stack_arrays(tuples, dtype): # fml def _asarray_compat(x): @@ -4055,33 +3432,7 @@ def _shape_compat(x): for i, arr in enumerate(arrays): stacked[i] = _asarray_compat(arr) - # index may box values - if ref_items.is_unique: - items = ref_items[ref_items.isin(names)] - else: - # a mi - if isinstance(ref_items, MultiIndex): - names = MultiIndex.from_tuples(names) - items = ref_items[ref_items.isin(names)] - - # plain old dups - else: - items = _ensure_index([n for n in names if n in ref_items]) - if len(items) != len(stacked): - raise ValueError("invalid names passed _stack_arrays") - - return items, stacked, placement - - -def _blocks_to_series_dict(blocks, index=None): - from pandas.core.series import Series - - series_dict = {} - - for block in blocks: - for item, vec in zip(block.items, block.values): - series_dict[item] = Series(vec, index=index, name=item) - return series_dict + return stacked, placement def _interleaved_dtype(blocks): @@ -4143,7 +3494,7 @@ def _lcd_dtype(l): return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) -def _consolidate(blocks, items): +def _consolidate(blocks): """ Merge blocks having same dtype, exclude non-consolidating blocks """ @@ -4154,7 +3505,7 @@ def _consolidate(blocks, items): new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), items, dtype=dtype, + merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate) if isinstance(merged_blocks, list): new_blocks.extend(merged_blocks) @@ -4164,14 +3515,7 @@ def _consolidate(blocks, items): return new_blocks -def _valid_blocks(newb): - if newb is None: - return [] - if not isinstance(newb, list): - newb = [ newb ] - return [ b for b in newb if len(b.items) > 0 ] - -def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True): +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): if len(blocks) == 1: return blocks[0] @@ -4182,22 
+3526,15 @@ def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True): raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype - if not items.is_unique: - blocks = sorted(blocks, key=lambda b: b.ref_locs.tolist()) - + new_ref_locs = np.concatenate([b.ref_locs for b in blocks]) new_values = _vstack([b.values for b in blocks], dtype) - new_items = blocks[0].items.append([b.items for b in blocks[1:]]) - new_block = make_block(new_values, new_items, items) - # unique, can reindex - if items.is_unique: - return new_block.reindex_items_from(items) + argsort = np.argsort(new_ref_locs) + new_values = new_values[argsort] + new_ref_locs = new_ref_locs[argsort] - # merge the ref_locs - new_ref_locs = [b._ref_locs for b in blocks] - if all([x is not None for x in new_ref_locs]): - new_block.set_ref_locs(np.concatenate(new_ref_locs)) - return new_block + return make_block(new_values, + fastpath=True, placement=new_ref_locs) # no merge return blocks @@ -4246,3 +3583,541 @@ def _possibly_compare(a, b, op): raise TypeError("Cannot compare types %r and %r" % tuple(type_names)) return res + + + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _invert_reordering(reordering, minlength=None): + """ + Invert reordering operation. + + Given array `reordering`, make `reordering_inv` of it, such that:: + + reordering_inv[reordering[x]] = x + + There are two types of indexers: + + source + is when element *s* at position *i* means that values to fill *i-th* + item of reindex operation should be taken from *s-th* item of the + original (this is what is returned by `pandas.Index.reindex`). + destination + is when element *d* at position *i* means that values from *i-th* item + of source should be used to fill *d-th* item of reindexing operation. + + This function will convert from *source* to *destination* and vice-versa. + + .. note:: trailing ``-1`` may be lost upon conversion (this is what + `minlength` is there for). + + .. note:: if *source* indexer is not unique, corresponding *destination* + indexer will have ``dtype=object`` and will contain lists. + + Examples: + + >>> _invert_reordering([3, -1, 2, 4, -1]) + array([-1, -1, 2, 0, 3]) + >>> _invert_reordering([-1, -1, 0, 2, 3]) + array([3, -1, 2, 4]) + >>> _invert_reordering([1,3,5]) + array([-1, 0, -1, 1, -1, 2]) + + """ + reordering = np.asanyarray(reordering) + if not com.is_integer_dtype(reordering): + raise ValueError("Only integer indexers are supported") + + nonneg_indices = reordering[reordering >= 0] + counts = np.bincount(nonneg_indices, minlength=minlength) + has_non_unique = (counts > 1).any() + + dtype = np.dtype(np.object_) if has_non_unique else np.dtype(np.int_) + inverted = np.empty_like(counts, dtype=dtype) + inverted.fill(-1) + + nonneg_positions = np.arange(len(reordering), dtype=np.int_)[reordering >= 0] + np.put(inverted, nonneg_indices, nonneg_positions) + + if has_non_unique: + nonunique_elements = np.arange(len(counts))[counts > 1] + for elt in nonunique_elements: + inverted[elt] = nonneg_positions[nonneg_indices == elt].tolist() + + return inverted + + +def ref_loc_groupby_block(ref_locs): + """ + Group given ref_locs by block. 
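
The `_invert_reordering` contract documented above, checked by a simplified standalone version (unique, non-empty indexers only; the full function also handles duplicates by storing lists):

```python
import numpy as np

def invert_reordering(reordering):
    """inverted[reordering[x]] = x, with -1 marking holes."""
    reordering = np.asarray(reordering)
    nonneg = reordering >= 0
    inverted = np.full(reordering[nonneg].max() + 1, -1, dtype=np.int_)
    inverted[reordering[nonneg]] = np.flatnonzero(nonneg)
    return inverted

print(invert_reordering([3, -1, 2, 4, -1]))  # [-1 -1  2  0  3]
print(invert_reordering([1, 3, 5]))          # [-1  0 -1  1 -1  2]
```
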
+ + Returns + ------- + iterator + Yield (block, block_locs, original_locs) + + """ + if len(ref_locs) == 0: + return + + blocks = com._ensure_object(lib.map_infer(ref_locs, + operator.itemgetter(0))) + indices = lib.map_infer(ref_locs, operator.itemgetter(1)) + + factorizer = Factorizer(len(blocks)) + block_ids = factorizer.factorize(blocks, na_sentinel=-1) + + for i in range(factorizer.get_count()): + locs = (block_ids == i).nonzero()[0] + yield blocks[locs[0]], indices[locs], locs + + na_locs = (block_ids == -1).nonzero()[0] + if len(na_locs): + yield None, indices[na_locs], na_locs + + +def items_overlap_with_suffix(left, lsuffix, right, rsuffix): + """ + If two indices overlap, add suffixes to overlapping entries. + + If corresponding suffix is empty, the entry is simply converted to string. + + """ + to_rename = left.intersection(right) + if len(to_rename) == 0: + return left, right + else: + if not lsuffix and not rsuffix: + raise ValueError('columns overlap but no suffix specified: %s' % + to_rename) + + def lrenamer(x): + if x in to_rename: + return '%s%s' % (x, lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '%s%s' % (x, rsuffix) + return x + + return (_transform_index(left, lrenamer), + _transform_index(right, rrenamer)) + + +def _transform_index(index, func): + """ + Apply function to all values found in index. + + This includes transforming multiindex entries separately. + + """ + if isinstance(index, MultiIndex): + items = [tuple(func(y) for y in x) for x in index] + return MultiIndex.from_tuples(items, names=index.names) + else: + items = [func(x) for x in index] + return Index(items, name=index.name) + + +def _putmask_smart(v, m, n): + """ + Return a new block, try to preserve dtype if possible. + + Parameters + ---------- + v : array_like + m : array_like + n : array_like + """ + + # n should be the length of the mask or a scalar here + if not is_list_like(n): + n = np.array([n] * len(m)) + + # see if we are only masking values that if putted + # will work in the current dtype + try: + nn = n[m] + nn_at = nn.astype(v.dtype) + if (nn == nn_at).all(): + nv = v.copy() + nv[m] = nn_at + return nv + except (ValueError, IndexError, TypeError): + pass + + # change the dtype + dtype, _ = com._maybe_promote(n.dtype) + nv = v.astype(dtype) + try: + nv[m] = n + except ValueError: + idx, = np.where(np.squeeze(m)) + for mask_index, new_val in zip(idx, n): + nv[mask_index] = new_val + return nv + + +def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): + """ + Concatenate block managers into one. + + Parameters + ---------- + mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples + axes : list of Index + concat_axis : int + copy : bool + + """ + concat_plans = [] + + for mgr, indexers in mgrs_indexers: + plan = get_mgr_concatenation_plan(mgr, indexers) + concat_plans = combine_concat_plans(concat_plans, plan, concat_axis) + + blocks = [concatenate_by_plan(plan, concat_axis, copy=copy) + for plan in concat_plans] + + return BlockManager(blocks, axes) + + +def get_empty_dtype_and_na(join_units): + """ + Return dtype and N/A values to use when concatenating specified units. + + Returned N/A value may be None which means there was no casting involved. 
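
A hedged restatement of the dtype resolution this function performs (see the body below): unanimous dtypes pass through with no N/A value, otherwise blocks upcast toward object; float is used as the simplified fallback here, whereas the real ladder also distinguishes datetime and timedelta:

```python
import numpy as np

def resolve(dtypes):
    """Simplified stand-in for get_empty_dtype_and_na's decision."""
    dtypes = set(np.dtype(d) for d in dtypes)
    if len(dtypes) == 1:
        return next(iter(dtypes)), None          # nothing to upcast
    if any(d == object or d == np.bool_ for d in dtypes):
        return np.dtype(object), np.nan
    return np.dtype(np.float64), np.nan          # simplified fallback

print(resolve(['int64', 'int64']))    # (dtype('int64'), None)
print(resolve(['int64', 'float64']))  # (dtype('float64'), nan)
```
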
+ + Returns + ------- + dtype + na + """ + + has_none_blocks = False + dtypes = set() + upcast_classes = set() + null_upcast_classes = set() + for unit in join_units: + if unit.block is None: + # This value is not supposed to be used anywhere, it's here to make + # sure "monotype" check (len(dtypes) == 1) fails and to indicate + # that upcasting is required. + has_none_blocks = True + continue + + dtype = unit.dtype + dtypes.add(unit.dtype) + + if issubclass(dtype.type, (np.object_, np.bool_)): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + else: + upcast_cls = 'float' + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_null: + null_upcast_classes.add(upcast_cls) + else: + upcast_classes.add(upcast_cls) + + if not has_none_blocks and len(dtypes) == 1: + # Unanimous decision, nothing to upcast. + return next(iter(dtypes)), None + + if not upcast_classes: + upcast_classes = null_upcast_classes + + # create the result + if 'object' in upcast_classes: + return np.dtype(np.object_), np.nan + elif 'float' in upcast_classes: + return np.dtype(np.float64), np.nan + elif 'datetime' in upcast_classes: + return np.dtype('M8[ns]'), tslib.iNaT + elif 'timedelta' in upcast_classes: + return np.dtype('m8[ns]'), tslib.iNaT + else: # pragma + raise AssertionError("invalid dtype determination in get_concat_dtype") + + +def concatenate_by_plan(plan, concat_axis, copy): + """ + Make block from concatenation plan. + """ + concat_start, join_units = plan + + empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) + + to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype, + upcasted_na=upcasted_na) + for ju in join_units] + + if len(to_concat) == 1: + # Only one block, nothing to concatenate. + if copy: + concat_values = to_concat[0].copy() + else: + concat_values = to_concat[0] + else: + concat_values = com._concat_compat(to_concat, axis=concat_axis) + + rng = np.arange(concat_values.shape[0]) + + if any(unit.is_sparse for unit in join_units): + concat_values = SparseArray(concat_values[0]) + + return make_block(concat_values, + placement=rng + concat_start) + + +def get_mgr_concatenation_plan(mgr, indexers): + """ + Construct concatenation plan for given block manager and indexers. + + Parameters + ---------- + mgr : BlockManager + indexers : dict of {axis: indexer} + + Returns + ------- + plan : list of (start_loc, [JoinUnit]) tuples + + """ + # Calculate post-reindex shape , save for item axis which will be separate + # for each block anyway. + mgr_shape = list(mgr.shape) + for ax, indexer in indexers.items(): + mgr_shape[ax] = len(indexer) + + if 0 in indexers: + indexer = indexers.pop(0) + ref_locs = com.take_1d(mgr._ref_locs, indexer, fill_value=(None, 0)) + else: + ref_locs = mgr._ref_locs + + plan = [] + for blk, blk_locs, concat_locs in ref_loc_groupby_block(ref_locs): + # result_locs are assumed to be sorted + slices = locs_to_contiguous_sequences(concat_locs) + + for slc in slices: + join_unit_indexers = indexers.copy() + axis0_blk_indexer = blk_locs[slc] + + # Omit indexer if no item reindexing is required. 
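+            # (an axis0 indexer equal to np.arange(len(block)) is an
+            # identity take, so it is dropped and the join unit can reuse
+            # the block's values as-is)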
+ if (blk is None or + np.array_equal(axis0_blk_indexer, np.arange(blk.shape[0]))): + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = axis0_blk_indexer + + blk_shape = copy.copy(mgr_shape) + blk_shape[0] = len(axis0_blk_indexer) + unit = JoinUnit(blk, join_unit_indexers, shape=blk_shape) + + plan.append((concat_locs[slc.start], [unit])) + + plan.sort() + return plan + + +def combine_concat_plans(existing_plan, new_plan, concat_axis): + """ + Combine multiple concatenation plans into one. + + existing_plan is updated in-place. + """ + if not existing_plan: + # Shortcut: nothing to combine with + return new_plan + + if concat_axis == 0: + # Another shortcut: when concatenating along item axis, plans can be + # simply appended. + last_offset, last_units = existing_plan[-1] + plan_offset = last_offset + last_units[0].shape[0] + return existing_plan + [(off_i + plan_offset, units_i) + for off_i, units_i in new_plan] + + from collections import deque + old_items = deque(existing_plan) + new_items = deque(new_plan) + result = [] + + while new_items: + old_start, old_units = old_items.popleft() + new_start, new_units = new_items.popleft() + + assert old_start == new_start + + old_len = old_units[0].shape[0] + new_len = new_units[0].shape[0] + + # Trim either old or new part as necessary + common_len = min(old_len, new_len) + if new_len > common_len: + new_items.appendleft((new_start + common_len, + [trim_join_unit(unit, common_len) + for unit in new_units])) + elif old_len > common_len: + old_items.appendleft((old_start + common_len, + [trim_join_unit(unit, common_len) + for unit in old_units])) + + result.append((old_start, old_units + new_units)) + + # The loop terminates when there's no new items, make sure that all old + # items are processed. + assert not old_items + + return result + + +def locs_to_contiguous_sequences(locs): + """ + Return contiguous sequences found in locs as slices. + """ + # FIXME: the code looks vaguely familiar, maybe there another version that + # can be reused instead + assert locs.ndim == 1 + length = len(locs) + + diff = np.diff(locs, axis=0) + break_locs = (diff != 1).nonzero()[0] + 1 + + if len(break_locs) == 0: + return [slice(0, length)] + else: + return [slice(b, e) + for b, e in lib.fast_zip([np.r_[0, break_locs], + np.r_[break_locs, length]])] + + +def trim_join_unit(join_unit, length): + """ + Reduce join_unit's shape along item axis to length. + + Extra items that didn't fit are returned as a separate block. + """ + + if 0 not in join_unit.indexers: + join_unit.indexers[0] = np.arange(join_unit.shape[0]) + + extra_indexers = copy.copy(join_unit.indexers) + extra_shape = copy.copy(join_unit.shape) + + extra_shape[0] = join_unit.shape[0] - length + extra_indexers[0] = extra_indexers[0][length:] + + join_unit.shape[0] = length + join_unit.indexers[0] = join_unit.indexers[0][:length] + + return JoinUnit(block=join_unit.block, indexers=extra_indexers, + shape=extra_shape) + + +class JoinUnit(object): + def __init__(self, block, indexers, shape): + # Passing shape explicitly is required for cases when block is None. + self.block = block + self.indexers = indexers + self.shape = shape + + def __repr__(self): + return '%s(%r, %s)' % (self.__class__.__name__, + self.block, self.indexers) + + @cache_readonly + def needs_filling(self): + for indexer in self.indexers.values(): + # FIXME: cache results of indexer == -1 checks. 
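+            # A -1 entry marks a destination position with no source item;
+            # get_reindexed_values must fill it with the upcast N/A value,
+            # which may in turn promote the unit's dtype.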
+ if (indexer == -1).any(): + return True + + return False + + @cache_readonly + def dtype(self): + if self.block is None: + raise AssertionError("Block is None, no dtype") + + if not self.needs_filling: + return self.block.dtype + else: + return np.dtype(com._maybe_promote(self.block.dtype, + self.block.fill_value)[0]) + return self._dtype + + @cache_readonly + def is_null(self): + return self.block is None or isnull(self.block.values).all() + + @cache_readonly + def is_sparse(self): + return self.block is not None and self.block.is_sparse + + def get_reindexed_values(self, empty_dtype, upcasted_na): + if upcasted_na is not None: + fill_value = upcasted_na + else: + # If upcasted_na is None, self.block should always exist. If it + # doesn't (i.e. is None), then it's a bug in get_empty_dtype_and_na + # function. + fill_value = self.block.fill_value + + if self.is_null: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + if np.prod(self.shape): + # NumPy 1.6 workaround: this statement gets strange if all + # blocks are of same dtype and some of them are empty: empty + # one are considered "null" so they must be filled, but no + # dtype upcasting happens and the dtype may not allow NaNs. + # + # In general, no one should get hurt when one tries to put + # incorrect values into empty array, but numpy 1.6 is strict + # about that. + missing_arr.fill(fill_value) + return missing_arr + else: + if upcasted_na is not None and self.block.is_bool: + # External code requested filling/upcasting, bool values must + # be upcasted to object to avoid being upcasted to numeric. + values = self.block.astype(np.object_).values + else: + values = self.block.get_values() + + for ax, indexer in self.indexers.items(): + values = com.take_nd(values, indexer, axis=ax, + fill_value=fill_value) + + return values + + +# def _align_kwargs(blocks, items, kwargs, align_keys, copy): +# aligned_objs = dict((k, kwargs[k]) for k in align_keys.items() +# if hasattr(kwargs[k], 'reindex_axis')) + +# if aligned_objs: +# kwargs = kwargs.copy() + +# for b in blocks: +# if aligned_objs: +# b_items = items.take(b.ref_locs) + +# for k, obj in aligned_objs.items(): +# axis = getattr(obj, '_info_axis_number', 0) +# kwargs[k] = obj.reindex_axis(b_items, axis=axis, +# copy=copy) + +# yield b, kwargs diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7dc266617c5fd..3a977757b68ae 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -447,15 +447,17 @@ def _unstack_frame(obj, level): new_blocks = [] mask_blocks = [] for blk in obj._data.blocks: + blk_items = obj._data.items.take(blk.ref_locs) bunstacker = _Unstacker(blk.values.T, obj.index, level=level, - value_columns=blk.items) + value_columns=blk_items) new_items = bunstacker.get_new_columns() + new_placement = new_columns.get_indexer(new_items) new_values, mask = bunstacker.get_new_values() - mblk = make_block(mask.T, new_items, new_columns) + mblk = make_block(mask.T, placement=new_placement) mask_blocks.append(mblk) - newb = make_block(new_values.T, new_items, new_columns) + newb = make_block(new_values.T, placement=new_placement) new_blocks.append(newb) result = DataFrame(BlockManager(new_blocks, new_axes)) @@ -1071,10 +1073,11 @@ def make_axis_dummies(frame, axis='minor', transform=None): return DataFrame(values, columns=items, index=frame.index) -def block2d_to_blocknd(values, items, shape, labels, ref_items=None): +def block2d_to_blocknd(values, placement, shape, labels, ref_items): """ pivot to the labels shape """ from pandas.core.internals 
import make_block - panel_shape = (len(items),) + shape + + panel_shape = (len(placement),) + shape # TODO: lexsort depth needs to be 2!! @@ -1092,13 +1095,10 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): pvalues.fill(fill_value) values = values - for i in range(len(items)): + for i in range(len(placement)): pvalues[i].flat[mask] = values[:, i] - if ref_items is None: - ref_items = items - - return make_block(pvalues, items, ref_items) + return make_block(pvalues, placement=placement) def factor_indexer(shape, labels): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 105bea92124fd..78f577566a28e 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -356,7 +356,7 @@ def encode(obj): return {'typ': 'block_manager', 'klass': obj.__class__.__name__, 'axes': data.axes, - 'blocks': [{'items': b.items, + 'blocks': [{'items': data.items.take(b.ref_locs), 'values': convert(b.values), 'shape': b.values.shape, 'dtype': b.dtype.num, @@ -481,10 +481,11 @@ def decode(obj): axes = obj['axes'] def create_block(b): - dtype = dtype_for(b['dtype']) - return make_block(unconvert(b['values'], dtype, b['compress']) - .reshape(b['shape']), b['items'], axes[0], - klass=getattr(internals, b['klass'])) + values = unconvert(b['values'], dtype_for(b['dtype']), + b['compress']).reshape(b['shape']) + return make_block(values=values, + klass=getattr(internals, b['klass']), + placement=axes[0].get_indexer(b['items'])) blocks = [create_block(b) for b in obj['blocks']] return globals()[obj['klass']](BlockManager(blocks, axes)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 27298e52e3186..e49ab3884d312 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1704,11 +1704,11 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, + def set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ - self.values = list(block.items) + self.values = list(block_items) dtype = block.dtype.name rvalues = block.values.ravel() inferred_type = lib.infer_dtype(rvalues) @@ -1763,7 +1763,7 @@ def set_atom(self, block, existing_col, min_itemsize, # end up here ### elif inferred_type == 'string' or dtype == 'object': self.set_atom_string( - block, + block, block_items, existing_col, min_itemsize, nan_rep, @@ -1776,8 +1776,8 @@ def set_atom(self, block, existing_col, min_itemsize, def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string( - self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string(self, block, block_items, existing_col, min_itemsize, + nan_rep, encoding): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False)[0] @@ -1789,9 +1789,9 @@ def set_atom_string( # we cannot serialize this data, so report an exception on a column # by column basis - for item in block.items: + for i, item in enumerate(block_items): - col = block.get(item) + col = block.iget(i) inferred_type = lib.infer_dtype(col.ravel()) if inferred_type != 'string': raise TypeError( @@ -2649,7 +2649,8 @@ def read(self, **kwargs): for i in range(self.nblocks): blk_items = self.read_index('block%d_items' % i) values = self.read_array('block%d_values' % i) - blk = make_block(values, blk_items, items) + blk = make_block(values, + 
placement=items.get_indexer(blk_items)) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -2665,12 +2666,12 @@ def write(self, obj, **kwargs): self.write_index('axis%d' % i, ax) # Supporting mixed-type DataFrame objects...nontrivial - self.attrs.nblocks = nblocks = len(data.blocks) - for i in range(nblocks): - blk = data.blocks[i] + self.attrs.nblocks = len(data.blocks) + for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 - self.write_array('block%d_values' % i, blk.values, items=blk.items) - self.write_index('block%d_items' % i, blk.items) + blk_items = data.items.take(blk.ref_locs) + self.write_array('block%d_values' % i, blk.values, items=blk_items) + self.write_index('block%d_items' % i, blk_items) class FrameFixed(BlockManagerFixed): @@ -3190,51 +3191,63 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, for a in self.non_index_axes: obj = _reindex_axis(obj, a[0], a[1]) + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.ref_locs) for blk in blocks] + # figure out data_columns and get out blocks block_obj = self.get_object(obj).consolidate() blocks = block_obj._data.blocks + blk_items = get_blk_items(block_obj._data, blocks) if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] data_columns = self.validate_data_columns( data_columns, min_itemsize) if len(data_columns): - blocks = block_obj.reindex_axis( + mgr = block_obj.reindex_axis( Index(axis_labels) - Index(data_columns), axis=axis - )._data.blocks + )._data + + blocks = mgr.blocks + blk_items = get_blk_items(mgr, blocks) for c in data_columns: - blocks.extend( - block_obj.reindex_axis([c], axis=axis)._data.blocks) + mgr = block_obj.reindex_axis([c], axis=axis)._data + blocks.extend(mgr.blocks) + blk_items.extend(get_blk_items(mgr, mgr.blocks)) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) + by_items = dict([(tuple(b_items.tolist()), (b, b_items)) + for b, b_items in zip(blocks, blk_items)]) new_blocks = [] + new_blk_items = [] for ea in existing_table.values_axes: items = tuple(ea.values) try: - b = by_items.pop(items) + b, b_items = by_items.pop(items) new_blocks.append(b) + new_blk_items.append(b_items) except: raise ValueError( "cannot match existing table structure for [%s] on " "appending data" % ','.join(com.pprint_thing(item) for item in items)) blocks = new_blocks + blk_items = new_blk_items # add my values self.values_axes = [] - for i, b in enumerate(blocks): + for i, (b, b_items) in enumerate(zip(blocks, blk_items)): # shape of the data column are the indexable axes klass = DataCol name = None # we have a data_column - if (data_columns and len(b.items) == 1 and - b.items[0] in data_columns): + if (data_columns and len(b_items) == 1 and + b_items[0] in data_columns): klass = DataIndexableCol - name = b.items[0] + name = b_items[0] self.data_columns.append(name) # make sure that we match up the existing columns @@ -3252,7 +3265,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, try: col = klass.create_for_block( i=i, name=name, version=self.version) - col.set_atom(block=b, + col.set_atom(block=b, block_items=b_items, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, @@ -3268,7 +3281,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, raise Exception( "cannot find the correct atom type -> " "[dtype->%s,items->%s] %s" - % (b.dtype.name, 
b.items, str(detail)) + % (b.dtype.name, b_items, str(detail)) ) j += 1 @@ -3490,7 +3503,8 @@ def read(self, where=None, columns=None, **kwargs): take_labels = [l.take(sorter) for l in labels] items = Index(c.values) block = block2d_to_blocknd( - sorted_values, items, tuple(N), take_labels) + values=sorted_values, placement=np.arange(len(items)), + shape=tuple(N), labels=take_labels, ref_items=items) # create the object mgr = BlockManager([block], [items] + levels) @@ -3823,7 +3837,7 @@ def read(self, where=None, columns=None, **kwargs): if values.ndim == 1: values = values.reshape(1, values.shape[0]) - block = make_block(values, cols_, cols_) + block = make_block(values, placement=np.arange(len(cols_))) mgr = BlockManager([block], [cols_, index_]) frames.append(DataFrame(mgr)) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index b70248d1ef3f4..3054b75ce56ac 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -83,7 +83,6 @@ def test_read_pickles_0_13_0(self): self.read_pickles('0.13.0') def test_round_trip_current(self): - for typ, dv in self.data.items(): for dt, expected in dv.items(): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9c9d20e51be64..90c2681b837e8 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -3504,7 +3504,6 @@ def test_invalid_filtering(self): self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']") def test_string_select(self): - # GH 2973 with ensure_clean_store(self.path) as store: diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 1c599653f9fc5..48576266c3b5f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -541,7 +541,7 @@ def sparse_reindex(self, new_index): raise TypeError('new index must be a SparseIndex') block = self.block.sparse_reindex(new_index) - new_data = SingleBlockManager(block, block.ref_items) + new_data = SingleBlockManager(block, self.index) return self._constructor(new_data, index=self.index, sparse_index=new_index, fill_value=self.fill_value).__finalize__(self) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 7696353dca6f1..3a2f8adf719e4 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1023,7 +1023,7 @@ def _compare_to_dense(a, b, da, db, op): for op in ops: _compare_to_dense(frame, frame[::2], frame.to_dense(), frame[::2].to_dense(), op) - for s in series: + for i, s in enumerate(series): _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), op) _compare_to_dense(s, frame, s.to_dense(), diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3a3d5a822163f..5d0aa992b9407 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -106,7 +106,6 @@ class CheckIndexing(object): def test_getitem(self): # slicing - sl = self.frame[:20] self.assertEqual(20, len(sl.index)) @@ -120,7 +119,7 @@ def test_getitem(self): self.assertIsNotNone(self.frame[key]) self.assertNotIn('random', self.frame) - with assertRaisesRegexp(KeyError, 'no item named random'): + with assertRaisesRegexp(KeyError, 'random'): self.frame['random'] df = self.frame.copy() @@ -2723,6 +2722,11 @@ def test_constructor_corner(self): df = DataFrame({}, columns=['foo', 'bar']) self.assertEqual(df.values.dtype, np.object_) + df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'), + dtype=int) + self.assertEqual(df.values.dtype, np.object_) + + 
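+        # (the requested int dtype is upcast to object above because the
+        # missing columns 'a' and 'c' are NaN-filled)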
def test_constructor_scalar_inference(self): data = {'int': 1, 'bool': True, 'float': 3., 'complex': 4j, 'object': 'foo'} @@ -3341,7 +3345,6 @@ def test_column_dups2(self): assert_frame_equal(result, expected) def test_column_dups_indexing(self): - def check(result, expected=None): if expected is not None: assert_frame_equal(result,expected) @@ -7804,11 +7807,11 @@ def test_regex_replace_dict_mixed(self): # scalar -> dict # to_replace regex, {value: value} + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) res = dfmix.replace('a', {'b': nan}, regex=True) res2 = dfmix.copy() res2.replace('a', {'b': nan}, regex=True, inplace=True) - expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': - mix['c']}) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) @@ -8645,7 +8648,6 @@ def test_reindex_dups(self): self.assertRaises(ValueError, df.reindex, index=list(range(len(df)))) def test_align(self): - af, bf = self.frame.align(self.frame) self.assertIsNot(af._data, self.frame._data) @@ -9789,7 +9791,7 @@ def test_reorder_levels(self): assert_frame_equal(result, expected) def test_sort_index(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) # axis=0 @@ -11820,7 +11822,7 @@ def test_columns_with_dups(self): df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) - result = df._data._set_ref_locs() + result = df._data._ref_locs self.assertEqual(len(result), len(df.columns)) # testing iget diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 2c9c8a94a1902..1e4c621dd1683 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -4,6 +4,7 @@ import numpy as np from pandas import Index, MultiIndex, DataFrame, Series +from pandas.compat import OrderedDict from pandas.sparse.array import SparseArray from pandas.core.internals import * import pandas.core.internals as internals @@ -17,89 +18,142 @@ def assert_block_equal(left, right): assert_almost_equal(left.values, right.values) assert(left.dtype == right.dtype) - assert(left.items.equals(right.items)) - assert(left.ref_items.equals(right.ref_items)) + assert_almost_equal(left.ref_locs, right.ref_locs) -def get_float_mat(n, k, dtype): +def get_numeric_mat(n, k, dtype): return np.repeat(np.atleast_2d(np.arange(k, dtype=dtype)), n, axis=0) -TEST_COLS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 's1', 's2'] -N = 10 - - -def get_float_ex(cols=['a', 'c', 'e'], dtype = np.float_): - floats = get_float_mat(N, len(cols), dtype = dtype).T - return make_block(floats, cols, TEST_COLS) - - -def get_complex_ex(cols=['h']): - complexes = (get_float_mat(N, 1, dtype = np.float_).T * 1j).astype(np.complex128) - return make_block(complexes, cols, TEST_COLS) - - -def get_obj_ex(cols=['b', 'd']): - mat = np.empty((N, 2), dtype=object) - mat[:, 0] = 'foo' - mat[:, 1] = 'bar' - return make_block(mat.T, cols, TEST_COLS) - -def get_bool_ex(cols=['f']): - mat = np.ones((N, 1), dtype=bool) - return make_block(mat.T, cols, TEST_COLS) - - -def get_int_ex(cols=['g'], dtype = np.int_): - mat = randn(N, 1).astype(dtype) - return make_block(mat.T, cols, TEST_COLS) +N = 10 -def get_dt_ex(cols=['h']): - mat = randn(N, 1).astype(int).astype('M8[ns]') - return make_block(mat.T, cols, TEST_COLS) -def get_sparse_ex1(): - sa1 = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], 
fill_value=0) - return make_block(sa1, ['s1'], TEST_COLS) +def create_block(typestr, placement, num_rows=None, num_offset=None): + placement = np.asanyarray(placement) + + if num_offset is None: + num_offset = 0 + + if num_rows is None: + num_rows = N + + if typestr in ('float', 'f8', 'f4', 'f2', + 'int', 'i8', 'i4', 'i2', 'i1', + 'uint', 'u8', 'u4', 'u2', 'u1'): + values = get_numeric_mat(num_rows, len(placement), + dtype=np.dtype(typestr)).T + num_offset + elif typestr in ('complex', 'c16', 'c8'): + values = get_numeric_mat(num_rows, len(placement), + dtype=np.dtype(typestr)).T + num_offset + values *= 1.j + elif typestr in ('object', 'string', 'O'): + values = np.repeat( + np.array([['A%s' % i + for i in np.arange(len(placement)) + num_offset]]), + num_rows, axis=0).T + elif typestr in ('bool'): + values = np.ones((num_rows, len(placement)), dtype=np.bool_).T + elif typestr in ('datetime', 'dt'): + values = (randn(num_rows, len(placement)).astype(int) + .astype('M8[ns]')).T + elif typestr in ('sparse',): + # FIXME: doesn't support num_rows != 10 + assert len(placement) == 1 + assert num_rows == 10 + values = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + arr = values.sp_values.view() + arr += (num_offset - 1) + else: + raise ValueError('Unsupported typestr: "%s"' % typestr) + + return make_block(values, placement=placement) + + +def create_mgr(descr, num_rows=None): + """ + Construct BlockManager from string description. + + String description syntax looks similar to np.matrix initializer. It looks + like this:: + + a,b,c: f8; d,e,f: i8 + + Rules are rather simple: + + * supported datatypes: + + * float, f8, f4, f2 + * int, i8, i4, i2, i1 + * uint, u8, u4, u2, u1 + * complex, c16, c8 + * bool + * object, string, O + * datetime, dt + * sparse + + * components are semicolon-separated + * each component is `NAME,NAME,NAME: DTYPE_ID` + * whitespace around colons & semicolons are removed + * components with same DTYPE_ID are combined into single block + * to force multiple blocks with same dtype, use '-SUFFIX':: + + 'a:f8-1; b:f8-2; c:f8-foobar' + + """ + if num_rows is None: + num_rows = N + + offset = 0 + mgr_items = [] + block_placements = OrderedDict() + for d in descr.split(';'): + d = d.strip() + names, blockstr = d.partition(':')[::2] + blockstr = blockstr.strip() + names = names.strip().split(',') + + mgr_items.extend(names) + placement = list(np.arange(len(names)) + offset) + try: + block_placements[blockstr].extend(placement) + except KeyError: + block_placements[blockstr] = placement + offset += len(names) -def get_sparse_ex2(): - sa2 = SparseArray([0, 0, 2, 3, 4, 0, 6, 7, 0, 8], fill_value=0) - return make_block(sa2, ['s2'], TEST_COLS) + mgr_items = Index(mgr_items) -def create_blockmanager(blocks): - l = [] - for b in blocks: - l.extend(b.items) - items = Index(l) - for b in blocks: - b.ref_items = items + blocks = [] + num_offset = 0 + for blockstr, placement in block_placements.items(): + typestr = blockstr.split('-')[0] + blocks.append(create_block(typestr, placement, num_rows=num_rows, + num_offset=num_offset,)) + num_offset += len(placement) - index_sz = blocks[0].shape[1] - return BlockManager(blocks, [items, np.arange(index_sz)]) + return BlockManager(sorted(blocks, key=lambda b: b.ref_locs[0]), + [mgr_items, np.arange(num_rows)]) -def create_singleblockmanager(blocks): - l = [] - for b in blocks: - l.extend(b.items) - items = Index(l) - for b in blocks: - b.ref_items = items - return SingleBlockManager(blocks, [items]) class TestBlock(tm.TestCase): 
_multiprocess_can_split_ = True def setUp(self): - self.fblock = get_float_ex() - self.cblock = get_complex_ex() - self.oblock = get_obj_ex() - self.bool_block = get_bool_ex() - self.int_block = get_int_ex() + # self.fblock = get_float_ex() # a,c,e + # self.cblock = get_complex_ex() # + # self.oblock = get_obj_ex() + # self.bool_block = get_bool_ex() + # self.int_block = get_int_ex() + + self.fblock = create_block('float', [0, 2, 4]) + self.cblock = create_block('complex', [7]) + self.oblock = create_block('object', [1, 3]) + self.bool_block = create_block('bool', [5]) + self.int_block = create_block('int', [6]) def test_constructor(self): - int32block = get_int_ex(['a'],dtype = np.int32) + int32block = create_block('i4', [0]) self.assertEqual(int32block.dtype, np.int32) def test_pickle(self): @@ -127,16 +181,16 @@ def test_merge(self): avals = randn(2, 10) bvals = randn(2, 10) - ref_cols = ['e', 'a', 'b', 'd', 'f'] + ref_cols = Index(['e', 'a', 'b', 'd', 'f']) - ablock = make_block(avals, ['e', 'b'], ref_cols) - bblock = make_block(bvals, ['a', 'd'], ref_cols) + ablock = make_block(avals, + ref_cols.get_indexer(['e', 'b'])) + bblock = make_block(bvals, + ref_cols.get_indexer(['a', 'd'])) merged = ablock.merge(bblock) - exvals = np.vstack((avals, bvals)) - excols = ['e', 'b', 'a', 'd'] - eblock = make_block(exvals, excols, ref_cols) - eblock = eblock.reindex_items_from(ref_cols) - assert_block_equal(merged, eblock) + assert_almost_equal(merged.ref_locs, [0, 1, 2, 3]) + assert_almost_equal(merged.values[[0, 2]], avals) + assert_almost_equal(merged.values[[1, 3]], bvals) # TODO: merge with mixed type? @@ -146,13 +200,16 @@ def test_copy(self): assert_block_equal(self.fblock, cop) def test_items(self): + raise nose.SkipTest('items are removed from Block') cols = self.fblock.items self.assert_numpy_array_equal(cols, ['a', 'c', 'e']) cols2 = self.fblock.items - self.assertIs(cols, cols2) + # disabled: items are generated + # self.assertIs(cols, cols2) def test_assign_ref_items(self): + raise nose.SkipTest('ref_items are removed from Block') new_cols = Index(['foo', 'bar', 'baz', 'quux', 'hi']) self.fblock.set_ref_items(new_cols) self.assert_numpy_array_equal(self.fblock.items, ['foo', 'baz', 'hi']) @@ -161,6 +218,7 @@ def test_reindex_index(self): pass def test_reindex_items_from(self): + raise nose.SkipTest('reindex_items_from is removed from Block') new_cols = Index(['e', 'b', 'c', 'f']) reindexed = self.fblock.reindex_items_from(new_cols) assert_almost_equal(reindexed.ref_locs, [0, 2]) @@ -175,19 +233,19 @@ def test_insert(self): pass def test_delete(self): - newb = self.fblock.delete('a') + newb = self.fblock.delete(0) assert_almost_equal(newb.ref_locs, [2, 4]) self.assert_((newb.values[0] == 1).all()) - newb = self.fblock.delete('c') + newb = self.fblock.delete(1) assert_almost_equal(newb.ref_locs, [0, 4]) self.assert_((newb.values[1] == 2).all()) - newb = self.fblock.delete('e') + newb = self.fblock.delete(2) assert_almost_equal(newb.ref_locs, [0, 2]) self.assert_((newb.values[1] == 1).all()) - self.assertRaises(Exception, self.fblock.delete, 'b') + self.assertRaises(Exception, self.fblock.delete, 3) def test_split_block_at(self): @@ -213,11 +271,9 @@ def test_split_block_at(self): self.assertEqual(len(bs), 0) def test_unicode_repr(self): - mat = np.empty((N, 2), dtype=object) - mat[:, 0] = 'foo' - mat[:, 1] = 'bar' - cols = ['b', u("\u05d0")] - str_repr = repr(make_block(mat.T, cols, TEST_COLS)) + raise nose.SkipTest('No items to test unicode on...') + str_repr = 
repr(create_block('object', [0, 1], + ref_items=['b', u("\u05d0")])) def test_get(self): pass @@ -233,68 +289,51 @@ def test_repr(self): class TestBlockManager(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): - self.blocks = [get_float_ex(), - get_obj_ex(), - get_bool_ex(), - get_int_ex(), - get_complex_ex()] - - all_items = [b.items for b in self.blocks] - - items = sorted(all_items[0].append(all_items[1:])) - items = Index(items) - for b in self.blocks: - b.ref_items = items - - self.mgr = BlockManager(self.blocks, [items, np.arange(N)]) + self.mgr = create_mgr('a: f8; b: object; c: f8; d: object; e: f8;' + 'f: bool; g: i8; h: complex') def test_constructor_corner(self): pass def test_attrs(self): - self.assertEquals(self.mgr.nblocks, len(self.mgr.blocks)) - self.assertEquals(len(self.mgr), len(self.mgr.items)) + mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2') + self.assertEquals(mgr.nblocks, 2) + self.assertEquals(len(mgr), 6) def test_is_mixed_dtype(self): - self.assertTrue(self.mgr.is_mixed_type) + self.assertFalse(create_mgr('a,b:f8').is_mixed_type) + self.assertFalse(create_mgr('a:f8-1; b:f8-2').is_mixed_type) - mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])]) - self.assertFalse(mgr.is_mixed_type) + self.assertTrue(create_mgr('a,b:f8; c,d: f4').is_mixed_type) + self.assertTrue(create_mgr('a,b:f8; c,d: object').is_mixed_type) def test_is_indexed_like(self): - self.assertTrue(self.mgr._is_indexed_like(self.mgr)) - mgr2 = self.mgr.reindex_axis(np.arange(N - 1), axis=1) - self.assertFalse(self.mgr._is_indexed_like(mgr2)) - - def test_block_id_vector_item_dtypes(self): - expected = [0, 1, 0, 1, 0, 2, 3, 4] - result = self.mgr.block_id_vector - assert_almost_equal(expected, result) - - result = self.mgr.item_dtypes - - # as the platform may not exactly match this, pseudo match - expected = ['float64', 'object', 'float64', 'object', 'float64', - 'bool', 'int64', 'complex128'] - for e, r in zip(expected, result): - np.dtype(e).kind == np.dtype(r).kind - - def test_duplicate_item_failure(self): - items = Index(['a', 'a']) - blocks = [get_bool_ex(['a']), get_float_ex(['a'])] - for b in blocks: - b.ref_items = items + mgr1 = create_mgr('a,b: f8') + mgr2 = create_mgr('a:i8; b:bool') + mgr3 = create_mgr('a,b,c: f8') + self.assertTrue(mgr1._is_indexed_like(mgr1)) + self.assertTrue(mgr1._is_indexed_like(mgr2)) + self.assertTrue(mgr1._is_indexed_like(mgr3)) + + self.assertFalse(mgr1._is_indexed_like( + mgr1.get_slice(slice(-1), axis=1))) + + def test_duplicate_ref_loc_failure(self): + tmp_mgr = create_mgr('a:bool; a: f8') + + axes, blocks = tmp_mgr.axes, tmp_mgr.blocks - # test trying to create _ref_locs with/o ref_locs set on the blocks - self.assertRaises(AssertionError, BlockManager, blocks, [items, np.arange(N)]) + blocks[0]._ref_locs = np.array([0]) + blocks[1]._ref_locs = np.array([0]) + # test trying to create block manager with overlapping ref locs + self.assertRaises(AssertionError, BlockManager, blocks, axes) - blocks[0].set_ref_locs([0]) - blocks[1].set_ref_locs([1]) - mgr = BlockManager(blocks, [items, np.arange(N)]) + blocks[0]._ref_locs = np.array([0]) + blocks[1]._ref_locs = np.array([1]) + mgr = BlockManager(blocks, axes) mgr.iget(1) # invalidate the _ref_locs @@ -302,7 +341,7 @@ def test_duplicate_item_failure(self): b._ref_locs = None mgr._ref_locs = None mgr._items_map = None - self.assertRaises(AssertionError, mgr._set_ref_locs, do_refs=True) + self.assertRaises(Exception, mgr._rebuild_ref_locs) def test_contains(self): self.assertIn('a', 
self.mgr) @@ -318,7 +357,7 @@ def test_pickle(self): assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) # share ref_items - self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) + # self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) # GH2431 self.assertTrue(hasattr(mgr2, "_is_consolidated")) @@ -328,9 +367,6 @@ def test_pickle(self): self.assertFalse(mgr2._is_consolidated) self.assertFalse(mgr2._known_consolidated) - def test_get(self): - pass - def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]): @@ -338,8 +374,35 @@ def test_get_scalar(self): exp = self.mgr.get(item)[i] assert_almost_equal(res, exp) + def test_get(self): + cols = Index(list('abc')) + values = np.random.rand(3, 3) + block = make_block(values=values.copy(), + placement=np.arange(3)) + mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) + + assert_almost_equal(mgr.get('a'), values[0]) + assert_almost_equal(mgr.get('b'), values[1]) + assert_almost_equal(mgr.get('c'), values[2]) + def test_set(self): - pass + mgr = create_mgr('a,b,c: int', num_rows=3) + + mgr.set('d', np.array(['foo'] * 3)) + mgr.set('b', np.array(['bar'] * 3)) + assert_almost_equal(mgr.get('a'), [0] * 3) + assert_almost_equal(mgr.get('b'), ['bar'] * 3) + assert_almost_equal(mgr.get('c'), [2] * 3) + assert_almost_equal(mgr.get('d'), ['foo'] * 3) + + def test_insert(self): + self.mgr.insert(0, 'inserted', np.arange(N)) + + self.assertEqual(self.mgr.items[0], 'inserted') + assert_almost_equal(self.mgr.get('inserted'), np.arange(N)) + + for blk in self.mgr.blocks: + yield self.assertIs, self.mgr.items, blk.ref_items def test_set_change_dtype(self): self.mgr.set('baz', np.zeros(N, dtype=bool)) @@ -370,58 +433,68 @@ def test_copy(self): self.assertTrue(found) def test_sparse(self): - mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2()]) + mgr = create_mgr('a: sparse-1; b: sparse-2') # what to test here? self.assertEqual(mgr.as_matrix().dtype, np.float64) def test_sparse_mixed(self): - mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2(),get_float_ex()]) + mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') self.assertEqual(len(mgr.blocks), 3) self.assertIsInstance(mgr, BlockManager) # what to test here? 
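+    # NB: in create_mgr() descriptors, same-dtype items normally share one
+    # block; a '-SUFFIX' (e.g. 'a: bool-1; b: bool-2') forces separate
+    # blocks of the same dtype.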
def test_as_matrix_float(self): - - mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)]) + mgr = create_mgr('c: f4; d: f2; e: f8') self.assertEqual(mgr.as_matrix().dtype, np.float64) - mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16)]) + mgr = create_mgr('c: f4; d: f2') self.assertEqual(mgr.as_matrix().dtype, np.float32) def test_as_matrix_int_bool(self): - - mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])]) + mgr = create_mgr('a: bool-1; b: bool-2') self.assertEqual(mgr.as_matrix().dtype, np.bool_) - mgr = create_blockmanager([get_int_ex(['a'],np.int64), get_int_ex(['b'],np.int64), get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ]) + mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') self.assertEqual(mgr.as_matrix().dtype, np.int64) - mgr = create_blockmanager([get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ]) + mgr = create_mgr('c: i4; d: i2; e: u1') self.assertEqual(mgr.as_matrix().dtype, np.int32) def test_as_matrix_datetime(self): - mgr = create_blockmanager([get_dt_ex(['h']), get_dt_ex(['g'])]) + mgr = create_mgr('h: datetime-1; g: datetime-2') self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]') def test_astype(self): - # coerce all - mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)]) - - for t in ['float16','float32','float64','int32','int64']: + mgr = create_mgr('c: f4; d: f2; e: f8') + for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + t = np.dtype(t) tmgr = mgr.astype(t) - self.assertEqual(tmgr.as_matrix().dtype, np.dtype(t)) + self.assertEqual(tmgr.get('c').dtype.type, t) + self.assertEqual(tmgr.get('d').dtype.type, t) + self.assertEqual(tmgr.get('e').dtype.type, t) # mixed - mgr = create_blockmanager([get_obj_ex(['a','b']),get_bool_ex(['c']),get_dt_ex(['d']),get_float_ex(['e'],np.float32), get_float_ex(['f'],np.float16), get_float_ex(['g'],np.float64)]) - for t in ['float16','float32','float64','int32','int64']: - tmgr = mgr.astype(t, raise_on_error = False).get_numeric_data() - self.assertEqual(tmgr.as_matrix().dtype, np.dtype(t)) + mgr = create_mgr('a,b: object; c: bool; d: datetime;' + 'e: f4; f: f2; g: f8') + for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + t = np.dtype(t) + tmgr = mgr.astype(t, raise_on_error=False) + self.assertEqual(tmgr.get('c').dtype.type, t) + self.assertEqual(tmgr.get('e').dtype.type, t) + self.assertEqual(tmgr.get('f').dtype.type, t) + self.assertEqual(tmgr.get('g').dtype.type, t) + + self.assertEqual(tmgr.get('a').dtype.type, np.object_) + self.assertEqual(tmgr.get('b').dtype.type, np.object_) + if t != np.int64: + self.assertEqual(tmgr.get('d').dtype.type, np.datetime64) + else: + self.assertEqual(tmgr.get('d').dtype.type, t) def test_convert(self): - def _compare(old_mgr, new_mgr): """ compare the blocks, numeric compare ==, object don't """ old_blocks = set(old_mgr.blocks) @@ -446,45 +519,41 @@ def _compare(old_mgr, new_mgr): self.assertTrue(found) # noops - mgr = create_blockmanager([get_int_ex(['f']), get_float_ex(['g'])]) + mgr = create_mgr('f: i8; g: f8') new_mgr = mgr.convert() _compare(mgr,new_mgr) - mgr = create_blockmanager([get_obj_ex(['a','b']), get_int_ex(['f']), get_float_ex(['g'])]) + mgr = create_mgr('a, b: object; f: i8; g: f8') new_mgr = mgr.convert() _compare(mgr,new_mgr) - # there could atcually be multiple dtypes resulting - 
def _check(new_mgr,block_type, citems): - items = set() - for b in new_mgr.blocks: - if isinstance(b,block_type): - for i in list(b.items): - items.add(i) - self.assertEqual(items, set(citems)) - # convert - mat = np.empty((N, 3), dtype=object) - mat[:, 0] = '1' - mat[:, 1] = '2.' - mat[:, 2] = 'foo' - b = make_block(mat.T, ['a','b','foo'], TEST_COLS) - - mgr = create_blockmanager([b, get_int_ex(['f']), get_float_ex(['g'])]) - new_mgr = mgr.convert(convert_numeric = True) - - _check(new_mgr,FloatBlock,['b','g']) - _check(new_mgr,IntBlock,['a','f']) - - mgr = create_blockmanager([b, get_int_ex(['f'],np.int32), get_bool_ex(['bool']), get_dt_ex(['dt']), - get_int_ex(['i'],np.int64), get_float_ex(['g'],np.float64), get_float_ex(['h'],np.float16)]) - new_mgr = mgr.convert(convert_numeric = True) - - _check(new_mgr,FloatBlock,['b','g','h']) - _check(new_mgr,IntBlock,['a','f','i']) - _check(new_mgr,ObjectBlock,['foo']) - _check(new_mgr,BoolBlock,['bool']) - _check(new_mgr,DatetimeBlock,['dt']) + mgr = create_mgr('a,b,foo: object; f: i8; g: f8') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEquals(new_mgr.get('a').dtype.type, np.int64) + self.assertEquals(new_mgr.get('b').dtype.type, np.float64) + self.assertEquals(new_mgr.get('foo').dtype.type, np.object_) + self.assertEquals(new_mgr.get('f').dtype.type, np.int64) + self.assertEquals(new_mgr.get('g').dtype.type, np.float64) + + mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;' + 'i: i8; g: f8; h: f2') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEquals(new_mgr.get('a').dtype.type, np.int64) + self.assertEquals(new_mgr.get('b').dtype.type, np.float64) + self.assertEquals(new_mgr.get('foo').dtype.type, np.object_) + self.assertEquals(new_mgr.get('f').dtype.type, np.int32) + self.assertEquals(new_mgr.get('bool').dtype.type, np.bool_) + self.assertEquals(new_mgr.get('dt').dtype.type, np.datetime64) + self.assertEquals(new_mgr.get('i').dtype.type, np.int64) + self.assertEquals(new_mgr.get('g').dtype.type, np.float64) + self.assertEquals(new_mgr.get('h').dtype.type, np.float16) def test_interleave(self): pass @@ -512,69 +581,79 @@ def test_consolidate_ordering_issues(self): cons = self.mgr.consolidate() self.assertEquals(cons.nblocks, 1) - self.assertTrue(cons.blocks[0].items.equals(cons.items)) + assert_almost_equal(cons.blocks[0].ref_locs, + np.arange(len(cons.items))) def test_reindex_index(self): pass def test_reindex_items(self): - def _check_cols(before, after, cols): - for col in cols: - assert_almost_equal(after.get(col), before.get(col)) - - # not consolidated - vals = randn(N) - self.mgr.set('g', vals) - reindexed = self.mgr.reindex_items(['g', 'c', 'a', 'd']) + # mgr is not consolidated, f8 & f8-2 blocks + mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;' + 'f: bool; g: f8-2') + + reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) self.assertEquals(reindexed.nblocks, 2) - assert_almost_equal(reindexed.get('g'), vals.squeeze()) - _check_cols(self.mgr, reindexed, ['c', 'a', 'd']) + assert_almost_equal(reindexed.items, ['g', 'c', 'a', 'd']) + assert_almost_equal(mgr.get('g'), reindexed.get('g')) + assert_almost_equal(mgr.get('c'), reindexed.get('c')) + 
assert_almost_equal(mgr.get('a'), reindexed.get('a')) + assert_almost_equal(mgr.get('d'), reindexed.get('d')) + + def test_multiindex_xs(self): + mgr = create_mgr('a,b,c: f8; d,e,f: i8') - def test_xs(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) - self.mgr.set_axis(1, index) + mgr.set_axis(1, index) + result = mgr.xs('bar', axis=1) + self.assertEqual(result.shape, (6, 2)) + self.assertEqual(result.axes[1][0], ('bar', 'one')) + self.assertEqual(result.axes[1][1], ('bar', 'two')) - result = self.mgr.xs('bar', axis=1) - expected = self.mgr.get_slice(slice(3, 5), axis=1) + def test_get_numeric_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + num_rows=3) + mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) - assert_frame_equal(DataFrame(result), DataFrame(expected)) + numeric = mgr.get_numeric_data() + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + assert_almost_equal(mgr.get('float'), numeric.get('float')) - def test_get_numeric_data(self): - int_ser = Series(np.array([0, 1, 2])) - float_ser = Series(np.array([0., 1., 2.])) - complex_ser = Series(np.array([0j, 1j, 2j])) - str_ser = Series(np.array(['a', 'b', 'c'])) - bool_ser = Series(np.array([True, False, True])) - obj_ser = Series(np.array([1, 'a', 5])) - dt_ser = Series(tm.makeDateIndex(3)) - # check types - df = DataFrame({'int': int_ser, 'float': float_ser, - 'complex': complex_ser, 'str': str_ser, - 'bool': bool_ser, 'obj': obj_ser, - 'dt': dt_ser}) - xp = DataFrame({'int': int_ser, 'float': float_ser, - 'complex': complex_ser, 'bool': bool_ser}) - rs = DataFrame(df._data.get_numeric_data()) - assert_frame_equal(xp, rs) - - xp = DataFrame({'bool': bool_ser}) - rs = DataFrame(df._data.get_bool_data()) - assert_frame_equal(xp, rs) - - rs = DataFrame(df._data.get_bool_data()) - df.ix[0, 'bool'] = not df.ix[0, 'bool'] - - self.assertEqual(rs.ix[0, 'bool'], df.ix[0, 'bool']) - - rs = DataFrame(df._data.get_bool_data(copy=True)) - df.ix[0, 'bool'] = not df.ix[0, 'bool'] - - self.assertEqual(rs.ix[0, 'bool'], not df.ix[0, 'bool']) + # Check sharing + numeric.set('float', np.array([100., 200., 300.])) + assert_almost_equal(mgr.get('float'), np.array([100., 200., 300.])) + + numeric2 = mgr.get_numeric_data(copy=True) + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + numeric2.set('float', np.array([1000., 2000., 3000.])) + assert_almost_equal(mgr.get('float'), np.array([100., 200., 300.])) + + def test_get_bool_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + num_rows=3) + mgr.set('obj', np.array([True, False, True], dtype=np.object_)) + + bools = mgr.get_bool_data() + assert_almost_equal(bools.items, ['bool']) + assert_almost_equal(mgr.get('bool'), bools.get('bool')) + + bools.set('bool', np.array([True, False, True])) + assert_almost_equal(mgr.get('bool'), [True, False, True]) + + # Check sharing + bools2 = mgr.get_bool_data(copy=True) + bools2.set('bool', np.array([False, True, False])) + assert_almost_equal(mgr.get('bool'), [True, False, True]) + + def test_unicode_repr_doesnt_raise(self): + str_repr = repr(create_mgr(u('b,\u05d0: object'))) def test_missing_unicode_key(self): df = DataFrame({"a": [1]}) @@ -585,23 +664,12 @@ def test_missing_unicode_key(self): def 
test_equals(self): # unique items - index = Index(list('abcdef')) - block1 = make_block(np.arange(12).reshape(3,4), list('abc'), index) - block2 = make_block(np.arange(12).reshape(3,4)*10, list('def'), index) - block1.ref_items = block2.ref_items = index - bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) - bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) - # non-unique items - index = Index(list('aaabbb')) - block1 = make_block(np.arange(12).reshape(3,4), list('aaa'), index, - placement=[0,1,2]) - block2 = make_block(np.arange(12).reshape(3,4)*10, list('bbb'), index, - placement=[3,4,5]) - block1.ref_items = block2.ref_items = index - bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) - bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) if __name__ == '__main__': diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1eb43237c3185..a6c2bb9f56602 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -611,7 +611,7 @@ def test_setitem_change_dtype(self): s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() assert_series_equal(dft['foo', 'two'], s > s.median()) - tm.assert_isinstance(dft._data.blocks[1].items, MultiIndex) + # tm.assert_isinstance(dft._data.blocks[1].items, MultiIndex) reindexed = dft.reindex(columns=[('foo', 'two')]) assert_series_equal(reindexed['foo', 'two'], s > s.median()) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 935dfb65a0807..d17e2e2dcb12b 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -14,12 +14,10 @@ from pandas.core.index import (Index, MultiIndex, _get_combined_index, _ensure_index, _get_consensus_names, _all_indexes_same) -from pandas.core.internals import (TimeDeltaBlock, IntBlock, BoolBlock, BlockManager, - make_block, _consolidate) -from pandas.util.decorators import cache_readonly, Appender, Substitution -from pandas.core.common import (PandasError, ABCSeries, - is_timedelta64_dtype, is_datetime64_dtype, - is_integer_dtype, isnull) +from pandas.core.internals import (items_overlap_with_suffix, + concatenate_block_managers) +from pandas.util.decorators import Appender, Substitution +from pandas.core.common import ABCSeries from pandas.io.parsers import TextFileReader import pandas.core.common as com @@ -27,7 +25,7 @@ import pandas.lib as lib import pandas.algos as algos import pandas.hashtable as _hash -import pandas.tslib as tslib + @Substitution('\nleft : DataFrame') @Appender(_merge_doc, indents=0) @@ -186,16 +184,20 @@ def __init__(self, left, right, how='inner', on=None, def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - # this is a bit kludgy - ldata, rdata = self._get_merge_data() + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + lindexers = {1: left_indexer} if left_indexer is not None else {} + rindexers = {1: right_indexer} if right_indexer is not None else {} - # TODO: more efficiently handle group keys to avoid extra - # consolidation! 
- join_op = _BlockJoinOperation([ldata, rdata], join_index, - [left_indexer, right_indexer], axis=1, - copy=self.copy) + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) - result_data = join_op.get_result() result = DataFrame(result_data).__finalize__(self, method='merge') self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -281,8 +283,18 @@ def _get_merge_data(self): """ ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf, - copydata=False) + + llabels, rlabels = items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf) + + if not llabels.equals(ldata.items): + ldata = ldata.copy(deep=False) + ldata.set_axis(0, llabels) + + if not rlabels.equals(rdata.items): + rdata = rdata.copy(deep=False) + rdata.set_axis(0, rlabels) + return ldata, rdata def _get_merge_keys(self): @@ -410,14 +422,14 @@ def _validate_specification(self): if self.right_index: if len(self.left_on) != self.right.index.nlevels: raise ValueError('len(left_on) must equal the number ' - 'of levels in the index of "right"') + 'of levels in the index of "right"') self.right_on = [None] * n elif self.right_on is not None: n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: raise ValueError('len(right_on) must equal the number ' - 'of levels in the index of "left"') + 'of levels in the index of "left"') self.left_on = [None] * n if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") @@ -487,7 +499,11 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() # this is a bit kludgy - ldata, rdata = self._get_merge_data() + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) if self.fill_method == 'ffill': left_join_indexer = algos.ffill_indexer(left_indexer) @@ -496,11 +512,14 @@ def get_result(self): left_join_indexer = left_indexer right_join_indexer = right_indexer - join_op = _BlockJoinOperation([ldata, rdata], join_index, - [left_join_indexer, right_join_indexer], - axis=1, copy=self.copy) + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) - result_data = join_op.get_result() result = DataFrame(result_data) self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -640,238 +659,6 @@ def _sort_labels(uniques, left, right): return new_left, new_right -class _BlockJoinOperation(object): - """ - BlockJoinOperation made generic for N DataFrames - - Object responsible for orchestrating efficient join operation between two - BlockManager data structures - """ - def __init__(self, data_list, join_index, indexers, axis=1, copy=True): - if axis <= 0: # pragma: no cover - raise MergeError('Only axis >= 1 supported for this operation') - - if len(data_list) != len(indexers): - raise AssertionError("data_list and indexers must have the same " - "length") - - self.units = [] - for data, indexer in zip(data_list, indexers): - if not data.is_consolidated(): - data = data.consolidate() - data._set_ref_locs() - 
self.units.append(_JoinUnit(data.blocks, indexer)) - - self.join_index = join_index - self.axis = axis - self.copy = copy - self.offsets = None - - # do NOT sort - self.result_items = _concat_indexes([d.items for d in data_list]) - self.result_axes = list(data_list[0].axes) - self.result_axes[0] = self.result_items - self.result_axes[axis] = self.join_index - - def _prepare_blocks(self): - blockmaps = [] - - for unit in self.units: - join_blocks = unit.get_upcasted_blocks() - type_map = {} - for blk in join_blocks: - type_map.setdefault(blk.ftype, []).append(blk) - blockmaps.append((unit, type_map)) - - return blockmaps - - def get_result(self): - """ - Returns - ------- - merged : BlockManager - """ - blockmaps = self._prepare_blocks() - kinds = _get_merge_block_kinds(blockmaps) - - # maybe want to enable flexible copying <-- what did I mean? - kind_blocks = [] - for klass in kinds: - klass_blocks = [] - for unit, mapping in blockmaps: - if klass in mapping: - klass_blocks.extend((unit, b) for b in mapping[klass]) - - # blocks that we are going to merge - kind_blocks.append(klass_blocks) - - # create the merge offsets, essentially where the resultant blocks go in the result - if not self.result_items.is_unique: - - # length of the merges for each of the klass blocks - self.offsets = np.zeros(len(blockmaps)) - for kb in kind_blocks: - kl = list(b.get_merge_length() for unit, b in kb) - self.offsets += np.array(kl) - - # merge the blocks to create the result blocks - result_blocks = [] - for klass_blocks in kind_blocks: - res_blk = self._get_merged_block(klass_blocks) - result_blocks.append(res_blk) - - return BlockManager(result_blocks, self.result_axes) - - def _get_merged_block(self, to_merge): - if len(to_merge) > 1: - - # placement set here - return self._merge_blocks(to_merge) - else: - unit, block = to_merge[0] - blk = unit.reindex_block(block, self.axis, - self.result_items, copy=self.copy) - - # set placement / invalidate on a unique result - if self.result_items.is_unique and blk._ref_locs is not None: - if not self.copy: - blk = blk.copy() - blk.set_ref_locs(None) - - return blk - - - def _merge_blocks(self, merge_chunks): - """ - merge_chunks -> [(_JoinUnit, Block)] - """ - funit, fblock = merge_chunks[0] - fidx = funit.indexer - - out_shape = list(fblock.get_values().shape) - - n = len(fidx) if fidx is not None else out_shape[self.axis] - - merge_lengths = list(blk.get_merge_length() for unit, blk in merge_chunks) - out_shape[0] = sum(merge_lengths) - out_shape[self.axis] = n - - # Should use Fortran order?? 
- block_dtype = _get_block_dtype([x[1] for x in merge_chunks]) - out = np.empty(out_shape, dtype=block_dtype) - - sofar = 0 - for unit, blk in merge_chunks: - out_chunk = out[sofar: sofar + len(blk)] - com.take_nd(blk.get_values(), unit.indexer, self.axis, out=out_chunk) - sofar += len(blk) - - # does not sort - new_block_items = _concat_indexes([b.items for _, b in merge_chunks]) - - # need to set placement if we have a non-unique result - # calculate by the existing placement plus the offset in the result set - placement = None - if not self.result_items.is_unique: - placement = [] - offsets = np.append(np.array([0]),self.offsets.cumsum()[:-1]) - for (unit, blk), offset in zip(merge_chunks,offsets): - placement.extend(blk.ref_locs+offset) - - return make_block(out, new_block_items, self.result_items, placement=placement) - - -class _JoinUnit(object): - """ - Blocks plus indexer - """ - - def __init__(self, blocks, indexer): - self.blocks = blocks - self.indexer = indexer - - @cache_readonly - def mask_info(self): - if self.indexer is None or not _may_need_upcasting(self.blocks): - return None - else: - mask = self.indexer == -1 - needs_masking = mask.any() - return (mask, needs_masking) - - def get_upcasted_blocks(self): - # will short-circuit and not compute needs_masking if indexer is None - if self.mask_info is not None and self.mask_info[1]: - return _upcast_blocks(self.blocks) - return self.blocks - - def reindex_block(self, block, axis, ref_items, copy=True): - if self.indexer is None: - result = block.copy() if copy else block - else: - result = block.reindex_axis(self.indexer, axis=axis, - mask_info=self.mask_info) - result.ref_items = ref_items - return result - - -def _may_need_upcasting(blocks): - for block in blocks: - if isinstance(block, (IntBlock, BoolBlock)) and not isinstance(block, TimeDeltaBlock): - return True - return False - - -def _upcast_blocks(blocks): - """ - Upcast and consolidate if necessary - """ - new_blocks = [] - for block in blocks: - if isinstance(block, TimeDeltaBlock): - # these are int blocks underlying, but are ok - newb = block - elif isinstance(block, IntBlock): - newb = make_block(block.values.astype(float), block.items, - block.ref_items, placement=block._ref_locs) - elif isinstance(block, BoolBlock): - newb = make_block(block.values.astype(object), block.items, - block.ref_items, placement=block._ref_locs) - else: - newb = block - new_blocks.append(newb) - - # use any ref_items - return _consolidate(new_blocks, newb.ref_items) - - -def _get_all_block_kinds(blockmaps): - kinds = set() - for mapping in blockmaps: - kinds |= set(mapping) - return kinds - - -def _get_merge_block_kinds(blockmaps): - kinds = set() - for _, mapping in blockmaps: - kinds |= set(mapping) - return kinds - - -def _get_block_dtype(blocks): - if len(blocks) == 0: - return object - blk1 = blocks[0] - dtype = blk1.dtype - - if issubclass(dtype.type, np.floating): - for blk in blocks: - if blk.dtype.type == np.float64: - return blk.dtype - - return dtype - #---------------------------------------------------------------------- # Concatenate DataFrame objects @@ -1061,220 +848,38 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.new_axes = self._get_new_axes() def get_result(self): - if self._is_series and self.axis == 0: - new_data = com._concat_compat([x.get_values() for x in self.objs]) - name = com._consensus_name_attr(self.objs) - new_data = self._post_merge(new_data) - return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, 
method='concat') - elif self._is_series: - data = dict(zip(range(len(self.objs)), self.objs)) - index, columns = self.new_axes - tmpdf = DataFrame(data, index=index) - if columns is not None: - tmpdf.columns = columns - return tmpdf.__finalize__(self, method='concat') + if self._is_series: + if self.axis == 0: + new_data = com._concat_compat([x.get_values() for x in self.objs]) + name = com._consensus_name_attr(self.objs) + return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat') + else: + data = dict(zip(range(len(self.objs)), self.objs)) + index, columns = self.new_axes + tmpdf = DataFrame(data, index=index) + if columns is not None: + tmpdf.columns = columns + return tmpdf.__finalize__(self, method='concat') else: - new_data = self._get_concatenated_data() - new_data = self._post_merge(new_data) - return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') + mgrs_indexers = [] + for obj in self.objs: + mgr = obj._data + indexers = {} + for ax, new_labels in enumerate(self.new_axes): + if ax == self.axis: + # Suppress reindexing on concat axis + continue - def _post_merge(self, data): - if isinstance(data, BlockManager): - data = data.post_merge(self.objs) - return data - - def _get_fresh_axis(self): - return Index(np.arange(len(self._get_concat_axis()))) - - def _prepare_blocks(self): - reindexed_data = self._get_reindexed_data() - - # we are consolidating as we go, so just add the blocks, no-need for dtype mapping - blockmaps = [] - for data in reindexed_data: - data = data.consolidate() - data._set_ref_locs() - blockmaps.append(data.get_block_map(typ='dict')) - return blockmaps, reindexed_data - - def _get_concatenated_data(self): - # need to conform to same other (joined) axes for block join - blockmaps, rdata = self._prepare_blocks() - kinds = _get_all_block_kinds(blockmaps) - - try: - # need to conform to same other (joined) axes for block join - new_blocks = [] - for kind in kinds: - klass_blocks = [] - for mapping in blockmaps: - l = mapping.get(kind) - if l is None: - l = [ None ] - klass_blocks.extend(l) - stacked_block = self._concat_blocks(klass_blocks) - new_blocks.append(stacked_block) - - if self.axis == 0 and self.ignore_index: - self.new_axes[0] = self._get_fresh_axis() - - for blk in new_blocks: - blk.ref_items = self.new_axes[0] - - new_data = BlockManager(new_blocks, self.new_axes) - - # Eventual goal would be to move everything to PandasError or other explicit error - except (Exception, PandasError): # EAFP - - # should not be possible to fail here for the expected reason with - # axis = 0 - if self.axis == 0: # pragma: no cover - raise - - new_data = {} - for item in self.new_axes[0]: - new_data[item] = self._concat_single_item(rdata, item) - - return new_data - - def _get_reindexed_data(self): - # HACK: ugh - - reindexed_data = [] - axes_to_reindex = list(enumerate(self.new_axes)) - axes_to_reindex.pop(self.axis) - - for obj in self.objs: - data = obj._data.prepare_for_merge() - for i, ax in axes_to_reindex: - data = data.reindex_axis(ax, axis=i, copy=False) - reindexed_data.append(data) - - return reindexed_data - - def _concat_blocks(self, blocks): - - values_list = [b.get_values() for b in blocks if b is not None] - concat_values = com._concat_compat(values_list, axis=self.axis) - - if self.axis > 0: - # Not safe to remove this check, need to profile - if not _all_indexes_same([b.items for b in blocks]): - # TODO: Either profile this piece or remove. 
- # FIXME: Need to figure out how to test whether this line exists or does not...(unclear if even possible - # or maybe would require performance test) - raise PandasError('dtypes are not consistent throughout ' - 'DataFrames') - return make_block(concat_values, - blocks[0].items, - self.new_axes[0], - placement=blocks[0]._ref_locs) - else: + obj_labels = mgr.axes[ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.reindex(new_labels)[1] - offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for - x in self.objs])] - indexer = np.concatenate([offsets[i] + b.ref_locs - for i, b in enumerate(blocks) - if b is not None]) - if self.ignore_index: - concat_items = indexer - else: - concat_items = self.new_axes[0].take(indexer) - - if self.ignore_index: - ref_items = self._get_fresh_axis() - return make_block(concat_values, concat_items, ref_items) - - block = make_block(concat_values, concat_items, self.new_axes[0]) - - # we need to set the ref_locs in this block so we have the mapping - # as we now have a non-unique index across dtypes, and we need to - # map the column location to the block location - # GH3602 - if not self.new_axes[0].is_unique: - block.set_ref_locs(indexer) - - return block - - def _concat_single_item(self, objs, item): - # this is called if we don't have consistent dtypes in a row-wise append - all_values = [] - dtypes = [] - alls = set() - - # figure out the resulting dtype of the combination - for data, orig in zip(objs, self.objs): - d = dict([ (t,False) for t in ['object','datetime','timedelta','other'] ]) - if item in orig: - values = data.get(item) - if hasattr(values,'to_dense'): - values = values.to_dense() - all_values.append(values) - - dtype = values.dtype - - if issubclass(dtype.type, (np.object_, np.bool_)): - d['object'] = True - alls.add('object') - elif is_datetime64_dtype(dtype): - d['datetime'] = True - alls.add('datetime') - elif is_timedelta64_dtype(dtype): - d['timedelta'] = True - alls.add('timedelta') - else: - d['other'] = True - alls.add('other') + mgrs_indexers.append((obj._data, indexers)) - else: - all_values.append(None) - d['other'] = True - alls.add('other') - - dtypes.append(d) - - if 'datetime' in alls or 'timedelta' in alls: - - if 'object' in alls or 'other' in alls: - - for v, d in zip(all_values,dtypes): - if d.get('datetime') or d.get('timedelta'): - pass - - # if we have all null, then leave a date/time like type - # if we have only that type left - elif v is None or isnull(v).all(): - - alls.discard('other') - alls.discard('object') - - # create the result - if 'object' in alls: - empty_dtype, fill_value = np.object_, np.nan - elif 'other' in alls: - empty_dtype, fill_value = np.float64, np.nan - elif 'datetime' in alls: - empty_dtype, fill_value = 'M8[ns]', tslib.iNaT - elif 'timedelta' in alls: - empty_dtype, fill_value = 'm8[ns]', tslib.iNaT - else: # pragma - raise AssertionError("invalid dtype determination in concat_single_item") - - to_concat = [] - for obj, item_values in zip(objs, all_values): - if item_values is None or isnull(item_values).all(): - shape = obj.shape[1:] - missing_arr = np.empty(shape, dtype=empty_dtype) - missing_arr.fill(fill_value) - to_concat.append(missing_arr) - else: - to_concat.append(item_values) + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True) - # this method only gets called with axis >= 1 - if self.axis < 1: - raise AssertionError("axis must be >= 1, input was" - " {0}".format(self.axis)) - return 
com._concat_compat(to_concat, axis=self.axis - 1) + return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') def _get_result_dim(self): if self._is_series and self.axis == 1: @@ -1303,13 +908,7 @@ def _get_new_axes(self): for i, ax in zip(indices, self.join_axes): new_axes[i] = ax - if self.ignore_index: - concat_axis = None - else: - concat_axis = self._get_concat_axis() - - new_axes[self.axis] = concat_axis - + new_axes[self.axis] = self._get_concat_axis() return new_axes def _get_comb_axis(self, i): @@ -1325,9 +924,16 @@ def _get_comb_axis(self, i): return _get_combined_index(all_indexes, intersect=self.intersect) def _get_concat_axis(self): + """ + Return index to be used along concatenation axis. + """ if self._is_series: if self.axis == 0: indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True # arange is always unique + return idx elif self.keys is None: names = [] for x in self.objs: @@ -1338,13 +944,21 @@ def _get_concat_axis(self): if x.name is not None: names.append(x.name) else: - return Index(np.arange(len(self.objs))) + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True + return idx + return Index(names) else: return _ensure_index(self.keys) else: indexes = [x._data.axes[self.axis] for x in self.objs] + if self.ignore_index: + idx = Index(np.arange(sum(len(i) for i in indexes))) + idx.is_unique = True + return idx + if self.keys is None: concat_axis = _concat_indexes(indexes) else: diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 146c244e7d775..8e11c78ecd135 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -584,6 +584,19 @@ def test_merge_different_column_key_names(self): assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan]) assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7]) + def test_merge_copy(self): + left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) + right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) + + merged = merge(left, right, left_index=True, + right_index=True, copy=True) + + merged['a'] = 6 + self.assert_((left['a'] == 0).all()) + + merged['d'] = 'peekaboo' + self.assert_((right['d'] == 'bar').all()) + def test_merge_nocopy(self): left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) @@ -1765,11 +1778,14 @@ def test_panel_join_overlap(self): p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] p2 = panel.ix[['ItemB', 'ItemC']] + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.ix[['ItemA']] - expected = p1_suf.join(p2_suf).join(no_overlap) + expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 23a6ae0982771..dd72a5245e7b2 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -337,7 +337,8 @@ def _take_new_index(obj, indexer, new_index, axis=0): elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError - return DataFrame(obj._data.take(indexer, new_index=new_index, axis=1)) + return DataFrame(obj._data.reindex_indexer( + new_axis=new_index, indexer=indexer, axis=1)) else: raise NotImplementedError From 
f51235aef9bbb50632b569d9e0c104816e5e31a0 Mon Sep 17 00:00:00 2001 From: immerrr Date: Wed, 16 Apr 2014 21:39:55 +0400 Subject: [PATCH 2/2] CLN: rename Block.ref_locs -> mgr_locs to reduce confusion PERF: add BlockPlacement class to optimize range-like cases CLN: unify get_slice & take/reindex ops along axis0 TST: add slicing/taking/reindexing tests for BlockManagers CLN: remove unused code --- pandas/computation/tests/test_eval.py | 0 pandas/core/format.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 5 +- pandas/core/groupby.py | 2 +- pandas/core/internals.py | 1504 ++++++++++++------------- pandas/core/reshape.py | 2 +- pandas/io/packers.py | 2 +- pandas/io/pytables.py | 6 +- pandas/lib.pyx | 424 +++++++ pandas/tests/test_frame.py | 4 +- pandas/tests/test_indexing.py | 2 +- pandas/tests/test_internals.py | 506 +++++++-- vb_suite/eval.py | 4 +- 14 files changed, 1587 insertions(+), 878 deletions(-) mode change 100755 => 100644 pandas/computation/tests/test_eval.py diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py old mode 100755 new mode 100644 diff --git a/pandas/core/format.py b/pandas/core/format.py index c76693e16494f..43eb0e890aa62 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1292,7 +1292,7 @@ def _save_chunk(self, start_i, end_i): float_format=self.float_format, date_format=self.date_format) - for col_loc, col in zip(b.ref_locs, d): + for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c32ca065d785d..fcd2e65afddcb 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1045,7 +1045,7 @@ def to_panel(self): for block in selfsorted._data.blocks: newb = block2d_to_blocknd( values=block.values.T, - placement=block.ref_locs, shape=shape, + placement=block.mgr_locs, shape=shape, labels=[major_labels, minor_labels], ref_items=selfsorted.columns) new_blocks.append(newb) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c5c77a29f465..3f2ecd8afd2d4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1689,7 +1689,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, labels, method, level, limit=limit, copy_if_needed=True) return self._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, fill_value=fill_value, - limit=limit, copy=copy).__finalize__(self) + limit=limit, copy=copy) def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False, @@ -1712,7 +1712,8 @@ def _reindex_with_indexers(self, reindexers, method=None, # TODO: speed up on homogeneous DataFrame objects new_data = new_data.reindex_indexer(index, indexer, axis=baxis, fill_value=fill_value, - allow_dups=allow_dups) + allow_dups=allow_dups, + copy=copy) if copy and new_data is self._data: new_data = new_data.copy() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b284e3c63209d..f650b41ff12be 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2243,7 +2243,7 @@ def _cython_agg_blocks(self, how, numeric_only=True): # see if we can cast the block back to the original dtype result = block._try_cast_result(result) - newb = make_block(result, placement=block.ref_locs) + newb = make_block(result, placement=block.mgr_locs) new_blocks.append(newb) if len(new_blocks) == 0: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 9c5564941cd08..7465fad39496c 100644 --- 
a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3,7 +3,7 @@ import re import operator from datetime import datetime, timedelta -from collections import defaultdict +from collections import defaultdict, deque import numpy as np from pandas.core.base import PandasObject @@ -14,7 +14,7 @@ ABCSparseSeries, _infer_dtype_from_scalar, _is_null_datelike_scalar, is_timedelta64_dtype, is_datetime64_dtype,) -from pandas.core.index import Index, MultiIndex, _ensure_index, _all_indexes_same +from pandas.core.index import Index, Int64Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray @@ -30,6 +30,10 @@ from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + +from pandas.lib import BlockPlacement + + class Block(PandasObject): """ @@ -38,7 +42,7 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ - __slots__ = ['_ref_locs', 'values', 'ndim'] + __slots__ = ['_mgr_locs', 'values', 'ndim'] is_numeric = False is_float = False is_integer = False @@ -55,20 +59,19 @@ class Block(PandasObject): _ftype = 'dense' def __init__(self, values, placement, ndim=None, fastpath=False): - if ndim is None: ndim = values.ndim - - if values.ndim != ndim: + elif values.ndim != ndim: raise ValueError('Wrong number of dimensions') + self.ndim = ndim - if len(placement) != len(values): - raise ValueError('Wrong number of items passed %d, placement implies ' - '%d' % (len(values), len(placement))) - - self._ref_locs = np.array(placement, dtype=np.int_, copy=True) + self.mgr_locs = placement self.values = values - self.ndim = ndim + + if len(self.mgr_locs) != len(self.values): + raise ValueError('Wrong number of items passed %d,' + ' placement implies %d' % ( + len(self.values), len(self.mgr_locs))) @property def _consolidate_key(self): @@ -88,8 +91,28 @@ def fill_value(self): return np.nan @property - def ref_locs(self): - return self._ref_locs + def mgr_locs(self): + return self._mgr_locs + + def make_block_same_class(self, values, placement, copy=False, + **kwargs): + """ + Wrap given values in a block of same type as self. + + `kwargs` are used in SparseBlock override. + + """ + if copy: + values = values.copy() + return make_block(values, placement, klass=self.__class__, + fastpath=True) + + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs def __unicode__(self): @@ -104,7 +127,8 @@ def __unicode__(self): shape = ' x '.join([com.pprint_thing(s) for s in self.shape]) result = '%s: %s, %s, dtype: %s' % ( - name, com.pprint_thing(self.ref_locs), shape, self.dtype) + name, com.pprint_thing(self.mgr_locs.indexer), shape, + self.dtype) return result @@ -112,29 +136,37 @@ def __len__(self): return len(self.values) def __getstate__(self): - return self.ref_locs, self.values + return self.mgr_locs.indexer, self.values def __setstate__(self, state): - self._ref_locs, self.values = state + self.mgr_locs = BlockPlacement(state[0]) + self.values = state[1] self.ndim = self.values.ndim def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] - def _getitem_block(self, slicer): + def getitem_block(self, slicer, new_mgr_locs=None): """ Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. 
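+
+        For example, ``blk.getitem_block((slice(None), slice(0, 2)))``
+        returns a same-class block over ``values[:, :2]``, with mgr_locs
+        carried over unchanged.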
+ """ - if isinstance(slicer, tuple): - axis0_slicer = slicer[0] - else: - axis0_slicer = slicer + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") - return self.__class__(values=self.values[slicer], - ndim=self.ndim, - fastpath=True, - placement=self.ref_locs[axis0_slicer]) + return self.make_block_same_class(new_values, new_mgr_locs) @property def shape(self): @@ -152,12 +184,6 @@ def dtype(self): def ftype(self): return "%s:%s" % (self.dtype, self._ftype) - def as_block(self, result): - """ if we are not a block, then wrap as a block, must have compatible shape """ - if not isinstance(result, Block): - result = make_block(values=result, placement=self.ref_locs,) - return result - def merge(self, other): return _merge_blocks([self, other]) @@ -175,31 +201,7 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, fill_value=fill_value, mask_info=mask_info) return make_block(new_values, ndim=self.ndim, fastpath=True, - placement=self.ref_locs) - - def reindex_items_from(self, indexer, method=None, - fill_value=None, limit=None, copy=True): - """ - Reindex to only those items contained in the input set of items - - E.g. if you have ['a', 'b'], and the input items is ['b', 'c', 'd'], - then the resulting items will be ['b'] - - Returns - ------- - reindexed : Block - """ - if fill_value is None: - fill_value = self.fill_value - - # single block only - assert self.ndim == 1 - new_values = com.take_1d(self.values, indexer, - fill_value=fill_value) - block = make_block(new_values, - ndim=self.ndim, fastpath=True, - placement=np.arange(len(new_values))) - return block + placement=self.mgr_locs) def get(self, item): loc = self.items.get_loc(item) @@ -220,43 +222,18 @@ def set(self, locs, values, check=False): def delete(self, loc): """ - Returns - ------- - y : Block (new object) + Delete given loc(-s) from block in-place. """ - new_values = np.delete(self.values, loc, 0) - return make_block(new_values, - ndim=self.ndim, klass=self.__class__, fastpath=True, - placement=np.delete(self.ref_locs, loc)) - - def split_block_at(self, item): - """ - Split block into zero or more blocks around columns with given label, - for "deleting" a column without having to copy data by returning views - on the original array. 
- - Returns - ------- - generator of Block - """ - loc = self.items.get_loc(item) - - if type(loc) == slice or type(loc) == int: - mask = [True] * len(self) - mask[loc] = False - else: # already a mask, inverted - mask = -loc - - for s, e in com.split_ranges(mask): - # FIXME: drop this function - yield make_block(self.values[s:e], - ndim=self.ndim, - klass=self.__class__, - fastpath=True) + self.values = np.delete(self.values, loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ - return self.as_block(func(self.values)) + result = func(self.values) + if not isinstance(result, Block): + result = make_block(values=result, placement=self.mgr_locs,) + + return result def fillna(self, value, limit=None, inplace=False, downcast=None): if not self._can_hold_na: @@ -308,7 +285,7 @@ def downcast(self, dtypes=None): nv = _possibly_downcast_to_dtype(values, dtypes) return [make_block(nv, ndim=self.ndim, - fastpath=True, placement=self.ref_locs)] + fastpath=True, placement=self.mgr_locs)] # ndim > 1 if dtypes is None: @@ -321,7 +298,7 @@ def downcast(self, dtypes=None): # item-by-item # this is expensive as it splits the blocks items-by-item blocks = [] - for i, rl in enumerate(self.ref_locs): + for i, rl in enumerate(self.mgr_locs): if dtypes == 'infer': dtype = 'infer' @@ -364,7 +341,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, values = com._astype_nansafe(self.values.ravel(), dtype, copy=True) values = values.reshape(self.values.shape) newb = make_block(values, - ndim=self.ndim, placement=self.ref_locs, + ndim=self.ndim, placement=self.mgr_locs, fastpath=True, dtype=dtype, klass=klass) except: if raise_on_error is True: @@ -387,38 +364,6 @@ def convert(self, copy=True, **kwargs): return [self.copy()] if copy else [self] - def prepare_for_merge(self, **kwargs): - """ a regular block is ok to merge as is """ - return self - - def post_merge(self, items, **kwargs): - """ we are non-sparse block, try to convert to a sparse block(s) """ - sparsified_mask = self.items.isin(items.keys()) - - if not sparsified_mask.any(): - return self - - new_blocks = [] - for i in sparsified_mask.nonzero()[0]: - item = self.items[i] - ref_loc = self.ref_locs[i] - - dtypes = set(items[item]) - # this is a safe bet with multiple dtypes - dtype = list(dtypes)[0] if len(dtypes) == 1 else np.float64 - - new_blocks.append(make_block( - values=SparseArray(self.iget(i), dtype=dtype), - placement=[ref_loc])) - - nonsparsified_locs = (~sparsified_mask).nonzero()[0] - if len(nonsparsified_locs): - new_blocks.append(make_block( - values=self.values[nonsparsified_locs], - placement=self.ref_locs[nonsparsified_locs])) - - return new_blocks - def _can_hold_element(self, value): raise NotImplementedError() @@ -490,7 +435,7 @@ def copy(self, deep=True): values = values.copy() return make_block(values, ndim=self.ndim, klass=self.__class__, fastpath=True, - placement=self.ref_locs) + placement=self.mgr_locs) def replace(self, to_replace, value, inplace=False, filter=None, regex=False): @@ -500,7 +445,7 @@ def replace(self, to_replace, value, inplace=False, filter=None, compatibility.""" mask = com.mask_missing(self.values, to_replace) if filter is not None: - filtered_out = ~Index(self.ref_locs, copy=False).isin(filter) + filtered_out = ~self.mgr_locs.isin(filter) mask[filtered_out.nonzero()[0]] = False if not mask.any(): @@ -573,7 +518,7 @@ def setitem(self, indexer, value): values = 
self._try_coerce_result(values)
             values = self._try_cast_result(values, dtype)
             return [make_block(transf(values),
-                               ndim=self.ndim, placement=self._ref_locs,
+                               ndim=self.ndim, placement=self.mgr_locs,
                                fastpath=True)]
         except (ValueError, TypeError) as detail:
             raise
@@ -629,7 +574,7 @@ def putmask(self, mask, new, align=True, inplace=False):
             # need to go column by column
             new_blocks = []
             if self.ndim > 1:
-                for i, ref_loc in enumerate(self.ref_locs):
+                for i, ref_loc in enumerate(self.mgr_locs):
                     m = mask[i]
                     v = new_values[i]
@@ -660,7 +605,7 @@ def putmask(self, mask, new, align=True, inplace=False):
                 else:
                     nv = _putmask_smart(new_values, mask, new)
                 new_blocks.append(make_block(values=nv,
-                                             placement=self.ref_locs,
+                                             placement=self.mgr_locs,
                                              fastpath=True))

             return new_blocks
@@ -669,7 +614,7 @@ def putmask(self, mask, new, align=True, inplace=False):
             return [self]

         return [make_block(new_values,
-                           placement=self.ref_locs, fastpath=True)]
+                           placement=self.mgr_locs, fastpath=True)]

     def interpolate(self, method='pad', axis=0, index=None,
                     values=None, inplace=False, limit=None,
@@ -750,7 +695,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
         blocks = [make_block(values,
                              ndim=self.ndim, klass=self.__class__,
-                             fastpath=True, placement=self.ref_locs)]
+                             fastpath=True, placement=self.mgr_locs)]
         return self._maybe_downcast(blocks, downcast)

     def _interpolate(self, method=None, index=None, values=None,
@@ -789,23 +734,37 @@ def func(x):
         blocks = [make_block(interp_values,
                              ndim=self.ndim, klass=self.__class__,
-                             fastpath=True, placement=self.ref_locs)]
+                             fastpath=True, placement=self.mgr_locs)]
         return self._maybe_downcast(blocks, downcast)

-    def take(self, indexer, new_axis, axis=1):
-        if axis < 1:
-            raise AssertionError('axis must be at least 1, got %d' % axis)
-        new_values = com.take_nd(self.values, indexer, axis=axis,
-                                 allow_fill=False)
+    def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
+        """
+        Take values according to indexer and return them as a block.

-        # need to preserve the ref_locs and just shift them
-        # GH6121
-        ref_locs = None
-        if not new_axis.is_unique:
-            ref_locs = self._ref_locs
+        """
+        if fill_tuple is None:
+            fill_value = self.fill_value
+            new_values = com.take_nd(self.get_values(), indexer, axis=axis,
+                                     allow_fill=False)
+        else:
+            fill_value = fill_tuple[0]
+            new_values = com.take_nd(self.get_values(), indexer, axis=axis,
+                                     allow_fill=True, fill_value=fill_value)
+
+        if new_mgr_locs is None:
+            if axis == 0:
+                slc = lib.indexer_as_slice(indexer)
+                if slc is not None:
+                    new_mgr_locs = self.mgr_locs[slc]
+                else:
+                    new_mgr_locs = self.mgr_locs[indexer]
+            else:
+                new_mgr_locs = self.mgr_locs

-        return [make_block(new_values, ndim=self.ndim,
-                           klass=self.__class__, placement=ref_locs, fastpath=True)]
+        if new_values.dtype != self.dtype:
+            return make_block(new_values, new_mgr_locs)
+        else:
+            return self.make_block_same_class(new_values, new_mgr_locs)

     def get_values(self, dtype=None):
         return self.values
@@ -815,7 +774,7 @@ def diff(self, n):
         new_values = com.diff(self.values, n, axis=1)
         return [make_block(values=new_values,
                            ndim=self.ndim, fastpath=True,
-                           placement=self.ref_locs)]
+                           placement=self.mgr_locs)]

     def shift(self, periods, axis=0):
         """ shift the block by periods, possibly upcast """
@@ -841,7 +800,7 @@ def shift(self, periods, axis=0):

         return [make_block(new_values,
                            ndim=self.ndim, fastpath=True,
-                           placement=self.ref_locs)]
+                           placement=self.mgr_locs)]

     def eval(self, func, other, raise_on_error=True, try_cast=False):
         """
@@ -933,7 +892,7 
@@ def handle_error(): result = self._try_cast_result(result) return [make_block(result, ndim=self.ndim, - fastpath=True, placement=self.ref_locs)] + fastpath=True, placement=self.mgr_locs)] def where(self, other, cond, align=True, raise_on_error=True, try_cast=False): @@ -1024,7 +983,7 @@ def func(c, v, o): result = self._try_cast_result(result) return make_block(result, - ndim=self.ndim, placement=self.ref_locs) + ndim=self.ndim, placement=self.mgr_locs) # might need to separate out blocks axis = cond.ndim - 1 @@ -1038,7 +997,7 @@ def func(c, v, o): r = self._try_cast_result( result.take(m.nonzero()[0], axis=axis)) result_blocks.append(make_block(r.T, - placement=self.ref_locs[m])) + placement=self.mgr_locs[m])) return result_blocks @@ -1048,11 +1007,13 @@ def equals(self, other): class NumericBlock(Block): + __slots__ = () is_numeric = True _can_hold_na = True class FloatOrComplexBlock(NumericBlock): + __slots__ = () def equals(self, other): if self.dtype != other.dtype or self.shape != other.shape: return False @@ -1060,6 +1021,7 @@ def equals(self, other): return ((left == right) | (np.isnan(left) & np.isnan(right))).all() class FloatBlock(FloatOrComplexBlock): + __slots__ = () is_float = True _downcast_dtype = 'int64' @@ -1100,6 +1062,7 @@ def should_store(self, value): class ComplexBlock(FloatOrComplexBlock): + __slots__ = () is_complex = True def _can_hold_element(self, element): @@ -1120,6 +1083,7 @@ def should_store(self, value): class IntBlock(NumericBlock): + __slots__ = () is_integer = True _can_hold_na = False @@ -1140,6 +1104,7 @@ def should_store(self, value): class TimeDeltaBlock(IntBlock): + __slots__ = () is_timedelta = True _can_hold_na = True is_numeric = False @@ -1224,6 +1189,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): class BoolBlock(NumericBlock): + __slots__ = () is_bool = True _can_hold_na = False @@ -1251,7 +1217,9 @@ def replace(self, to_replace, value, inplace=False, filter=None, inplace=inplace, filter=filter, regex=regex) + class ObjectBlock(Block): + __slots__ = () is_object = True _can_hold_na = True @@ -1284,7 +1252,7 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T blocks = [] if by_item and not self._is_single_block: - for i, rl in enumerate(self.ref_locs): + for i, rl in enumerate(self.mgr_locs): values = self.iget(i) values = com._possibly_convert_objects( @@ -1304,7 +1272,7 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T convert_numeric=convert_numeric ).reshape(self.values.shape) blocks.append(make_block(values, - ndim=self.ndim, placement=self.ref_locs)) + ndim=self.ndim, placement=self.mgr_locs)) return blocks @@ -1456,17 +1424,17 @@ def re_replacer(s): if filter is None: filt = slice(None) else: - filt = (Index(self.ref_locs, copy=False) - .isin(filter).nonzero()[0]) + filt = self.mgr_locs.isin(filter).nonzero()[0] new_values[filt] = f(new_values[filt]) return [self if inplace else make_block(new_values, - fastpath=True, placement=self.ref_locs)] + fastpath=True, placement=self.mgr_locs)] class DatetimeBlock(Block): + __slots__ = () is_datetime = True _can_hold_na = True @@ -1548,7 +1516,7 @@ def fillna(self, value, limit=None, np.putmask(values, mask, value) return [self if inplace else make_block(values, - fastpath=True, placement=self.ref_locs)] + fastpath=True, placement=self.mgr_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs): @@ -1611,9 +1579,8 @@ def get_values(self, dtype=None): class SparseBlock(Block): - 
""" implement as a list of sparse arrays of the same dtype """ - __slots__ = ['_ref_locs', 'ndim', 'values'] + __slots__ = () is_sparse = True is_numeric = True _can_hold_na = True @@ -1625,25 +1592,23 @@ def __init__(self, values, placement, ndim=None, fastpath=False,): # kludgetastic - if ndim is not None: - if ndim == 1: - ndim = 1 - elif ndim > 2: - ndim = ndim - else: + if ndim is None: if len(placement) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self._ref_locs = np.array(placement, dtype=np.int_, copy=True) + self.mgr_locs = placement + + if not isinstance(values, SparseArray): + raise TypeError("values must be SparseArray") self.values = values @property def shape(self): - return (len(self.ref_locs), self.sp_index.length) + return (len(self.mgr_locs), self.sp_index.length) @property def itemsize(self): @@ -1651,6 +1616,7 @@ def itemsize(self): @property def fill_value(self): + #return np.nan return self.values.fill_value @fill_value.setter @@ -1669,7 +1635,8 @@ def sp_values(self, v): # reset the sparse values self.values = SparseArray(v, sparse_index=self.sp_index, kind=self.kind, dtype=v.dtype, - fill_value=self.fill_value, copy=False) + fill_value=self.values.fill_value, + copy=False) def iget(self, col): if col != 0: @@ -1716,19 +1683,38 @@ def get_values(self, dtype=None): return values def copy(self, deep=True): - return self.make_block(values=self.values, - sparse_index=self.sp_index, - kind=self.kind, copy=deep, - placement=self.ref_locs) - - def make_block(self, values, placement, - sparse_index=None, kind=None, dtype=None, fill_value=None, - copy=False, fastpath=True): + return self.make_block_same_class(values=self.values, + sparse_index=self.sp_index, + kind=self.kind, copy=deep, + placement=self.mgr_locs) + + def make_block_same_class(self, values, placement, + sparse_index=None, kind=None, dtype=None, + fill_value=None, copy=False, fastpath=True): """ return a new block """ if dtype is None: dtype = self.dtype if fill_value is None: - fill_value = self.fill_value + fill_value = self.values.fill_value + + # if not isinstance(values, SparseArray) and values.ndim != self.ndim: + # raise ValueError("ndim mismatch") + + if values.ndim == 2: + nitems = values.shape[0] + + if nitems == 0: + # kludgy, but SparseBlocks cannot handle slices, where the + # output is 0-item, so let's convert it to a dense block: it + # won't take space since there's 0 items, plus it will preserve + # the dtype. 
+ return make_block(np.empty(values.shape, dtype=dtype), + placement, fastpath=True,) + elif nitems > 1: + raise ValueError("Only 1-item 2d sparse blocks are supported") + else: + values = values.reshape(values.shape[1]) + new_values = SparseArray(values, sparse_index=sparse_index, kind=kind or self.kind, dtype=dtype, fill_value=fill_value, copy=copy) @@ -1740,8 +1726,8 @@ def interpolate(self, method='pad', axis=0, inplace=False, values = com.interpolate_2d( self.values.to_dense(), method, axis, limit, fill_value) - return self.make_block(values=values, - placement=self.ref_locs) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) def fillna(self, value, limit=None, inplace=False, downcast=None): # we may need to upcast our fill to match our dtype @@ -1750,9 +1736,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): if issubclass(self.dtype.type, np.floating): value = float(value) values = self.values if inplace else self.values.copy() - return [self.make_block(values=values.get_values(value), - fill_value=value, placement=self.ref_locs)] - + return [self.make_block_same_class(values=values.get_values(value), + fill_value=value, + placement=self.mgr_locs)] def shift(self, periods, axis=0): """ shift the block by periods """ @@ -1770,15 +1756,7 @@ def shift(self, periods, axis=0): new_values[:periods] = fill_value else: new_values[periods:] = fill_value - return [self.make_block(new_values, placement=self.ref_locs)] - - def take(self, indexer, new_axis, axis=1): - """ going to take our items - along the long dimension""" - if axis < 1: - raise AssertionError('axis must be at least 1, got %d' % axis) - - return [self.make_block(self.values.take(indexer))] + return [self.make_block_same_class(new_values, placement=self.mgr_locs)] def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): @@ -1791,41 +1769,9 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, # taking on the 0th axis always here if fill_value is None: fill_value = self.fill_value - return self.make_block(self.values.take(indexer), - fill_value=fill_value, - placement=self.ref_locs) - - def reindex_items_from(self, indexer, method=None, - fill_value=None, limit=None, copy=True): - """ - Reindex to only those items contained in the input set of items - - E.g. 
if you have ['a', 'b'], and the input items is ['b', 'c', 'd'], - then the resulting items will be ['b'] - - Returns - ------- - reindexed : Block - """ - - # 1-d always - if indexer is None: - indexer = np.arange(len(self.ref_locs)) - - # single block only - assert self.ndim == 1 - new_values = com.take_1d(self.values.values, indexer) - - # fill if needed - if method is not None or limit is not None: - if fill_value is None: - fill_value = self.fill_value - new_values = com.interpolate_2d(new_values, method=method, - limit=limit, fill_value=fill_value) - - return self.make_block(new_values, - copy=copy, - placement=np.arange(len(indexer))) + return self.make_block_same_class(self.values.take(indexer), + fill_value=fill_value, + placement=self.mgr_locs) def sparse_reindex(self, new_index): """ sparse reindex and return a new block @@ -1833,13 +1779,8 @@ def sparse_reindex(self, new_index): values = self.values values = values.sp_index.to_int_index().reindex( values.sp_values.astype('float64'), values.fill_value, new_index) - return self.make_block(values, sparse_index=new_index, - placement=self.ref_locs) - - def split_block_at(self, item): - if len(self.items) == 1 and item == self.items[0]: - return [] - return super(SparseBlock, self).split_block_at(self, item) + return self.make_block_same_class(values, sparse_index=new_index, + placement=self.mgr_locs) def _try_cast_result(self, result, dtype=None): return result @@ -1953,15 +1894,15 @@ class BlockManager(PandasObject): This is *not* a public API class """ __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_has_sparse', '_ref_locs'] + '_is_consolidated', '_blknos', '_blklocs'] def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): self.axes = [_ensure_index(ax) for ax in axes] - self.blocks = blocks + self.blocks = tuple(blocks) for block in blocks: if block.is_sparse: - if len(block.ref_locs) != 1: + if len(block.mgr_locs) != 1: raise AssertionError("Sparse block refers to multiple items") else: if self.ndim != block.ndim: @@ -1972,10 +1913,9 @@ def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): if do_integrity_check: self._verify_integrity() - self._has_sparse = False self._consolidate_check() - self._rebuild_ref_locs() + self._rebuild_blknos_and_blklocs() def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ @@ -2016,23 +1956,60 @@ def set_axis(self, axis, new_labels): self.axes[axis] = new_labels - def _rebuild_ref_locs(self): + def rename_axis(self, mapper, axis, copy=True): """ - Update mgr._ref_locs according to blk.ref_locs. + Rename one of axes. 
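+
+        Returns a new manager; the underlying data is deep-copied only
+        when copy=True (the default).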
+ + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True + """ - blocks = np.empty(self.shape[0], dtype=np.object_) - blk_locs = np.empty(self.shape[0], dtype=np.int_) - blk_locs.fill(-1) + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper)) + return obj - for blk in self.blocks: - rl = blk.ref_locs - blocks[rl] = blk - blk_locs[rl] = np.arange(len(rl)) + def add_prefix(self, prefix): + f = (str(prefix) + '%s').__mod__ + return self.rename_axis(f, axis=0) + + def add_suffix(self, suffix): + f = ('%s' + str(suffix)).__mod__ + return self.rename_axis(f, axis=0) + + @property + def _is_single_block(self): + if self.ndim == 1: + return True + + if len(self.blocks) != 1: + return False + + blk = self.blocks[0] + return (blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice == slice(0, len(self), 1)) + + def _rebuild_blknos_and_blklocs(self): + """ + Update mgr._blknos / mgr._blklocs. + """ + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) - if (blk_locs == -1).any(): + if (new_blknos == -1).any(): raise AssertionError("Gaps in blk ref_locs") - self._ref_locs = lib.fast_zip([blocks, blk_locs]) + self._blknos = new_blknos + self._blklocs = new_blklocs # make items read only for now def _get_items(self): @@ -2055,14 +2032,16 @@ def get_ftype_counts(self): return self._get_counts(lambda b: b.ftype) def get_dtypes(self): - return [rl[0].dtype for rl in self._ref_locs] + dtypes = np.array([blk.dtype for blk in self.blocks]) + return dtypes.take(self._blknos) def get_ftypes(self): - return [rl[0].ftype for rl in self._ref_locs] + ftypes = np.array([blk.ftype for blk in self.blocks]) + return ftypes.take(self._blknos) def __getstate__(self): block_values = [b.values for b in self.blocks] - block_items = [self.items.take(b.ref_locs) for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] axes_array = [ax for ax in self.axes] return axes_array, block_values, block_items @@ -2083,15 +2062,14 @@ def __setstate__(self, state): blk = make_block(values, placement=self.axes[0].get_indexer(items)) blocks.append(blk) - self.blocks = blocks + self.blocks = tuple(blocks) self._post_setstate() def _post_setstate(self): self._is_consolidated = False self._known_consolidated = False - self._rebuild_ref_locs() - self._set_has_sparse() + self._rebuild_blknos_and_blklocs() def __len__(self): return len(self.items) @@ -2110,7 +2088,7 @@ def __unicode__(self): def _verify_integrity(self): mgr_shape = self.shape - tot_items = sum(len(x.ref_locs) for x in self.blocks) + tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: if not block.is_sparse and block.shape[1:] != mgr_shape[1:]: construction_error(tot_items, block.shape[1:], self.axes) @@ -2140,10 +2118,14 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): result_blocks = [] + # filter kwarg is used in replace-* family of methods if filter is not None: - # filter kwarg is used in replace-* family of methods filter_locs = set(self.items.get_indexer_for(filter)) - kwargs['filter'] = filter_locs + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs['filter'] = filter_locs if f 
== 'where' and kwargs.get('align', True): align_copy = True @@ -2167,13 +2149,12 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): for b in self.blocks: if filter is not None: - valid_locs = filter_locs.intersection(b.ref_locs) - if not valid_locs: + if not b.mgr_locs.isin(filter_locs).any(): result_blocks.append(b) continue if aligned_args: - b_items = self.items.take(b.ref_locs) + b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): axis = getattr(obj, '_info_axis_number', 0) @@ -2265,7 +2246,7 @@ def comp(s): else: # get our mask for this element, sized to this # particular block - m = masks[i][b.ref_locs] + m = masks[i][b.mgr_locs.indexer] if m.any(): new_rb.extend(b.putmask(m, d, inplace=True)) else: @@ -2289,10 +2270,6 @@ def _consolidate_check(self): ftypes = [blk.ftype for blk in self.blocks] self._is_consolidated = len(ftypes) == len(set(ftypes)) self._known_consolidated = True - self._set_has_sparse() - - def _set_has_sparse(self): - self._has_sparse = any((blk.is_sparse for blk in self.blocks)) @property def is_mixed_type(self): @@ -2337,14 +2314,15 @@ def combine(self, blocks, copy=True): if len(blocks) == 0: return self.make_empty() - indexer = np.sort(np.concatenate([b.ref_locs for b in blocks])) + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = _invert_reordering(indexer) new_items = self.items.take(indexer) new_blocks = [] for b in blocks: b = b.copy(deep=copy) - b._ref_locs = inv_indexer.take(b.ref_locs) + b.mgr_locs = inv_indexer.take(b.mgr_locs.as_array) new_blocks.append(b) new_axes = list(self.axes) @@ -2352,36 +2330,22 @@ def combine(self, blocks, copy=True): return self.__class__(new_blocks, new_axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): - new_axes = list(self.axes) - new_axes[axis] = new_axes[axis][slobj] + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") if axis == 0: - new_items = new_axes[0] - - # we want to preserver the view of a single-block - if (len(self.blocks) == 1 and - (self.blocks[0]._ref_locs == np.arange(self.shape[0])).all()): - blk = self.blocks[0] - newb = make_block(blk._slice(slobj), - klass=blk.__class__, fastpath=True, - placement=np.arange(len(new_items))) - - new_blocks = [newb] - else: - return self.reindex_indexer( - new_items, indexer=np.arange(len(self.items))[slobj], - axis=0, allow_dups=True) + new_blocks = self._slice_take_blocks_ax0(slobj) else: - slicer = [slice(None)] * self.ndim + slicer = [slice(None)] * (axis + 1) slicer[axis] = slobj + slicer = tuple(slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] - new_blocks = [make_block(block._slice(slicer), - klass=block.__class__, - fastpath=True, - placement=block.ref_locs) - for block in self.blocks] + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] - bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) + bm = self.__class__(new_blocks, new_axes, do_integrity_check=False, + fastpath=True) bm._consolidate_inplace() return bm @@ -2421,9 +2385,7 @@ def as_matrix(self, items=None): else: mgr = self - if (len(mgr.blocks) == 1 and - (mgr.blocks[0]._ref_locs is None or - (mgr.blocks[0]._ref_locs == np.arange(mgr.shape[0])).all())): + if self._is_single_block: return mgr.blocks[0].get_values() else: return mgr._interleave() @@ -2436,12 +2398,25 @@ def _interleave(self): dtype = _interleaved_dtype(self.blocks) result = np.empty(self.shape, dtype=dtype) + 
+ if result.shape[0] == 0: + # Workaround for numpy 1.7 bug: + # + # >>> a = np.empty((0,10)) + # >>> a[slice(0,0)] + # array([], shape=(0, 10), dtype=float64) + # >>> a[[]] + # Traceback (most recent call last): + # File "", line 1, in + # IndexError: index 0 is out of bounds for axis 0 with size 0 + return result + itemmask = np.zeros(self.shape[0]) for blk in self.blocks: - rl = blk.ref_locs - result[rl] = blk.get_values(dtype) - itemmask[rl] = 1 + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 if not itemmask.all(): raise AssertionError('Some items were not contained in blocks') @@ -2477,14 +2452,14 @@ def xs(self, key, axis=1, copy=True, takeable=False): for blk in self.blocks: newb = make_block(values=blk.values[slicer], klass=blk.__class__, fastpath=True, - placement=blk.ref_locs) + placement=blk.mgr_locs) new_blocks.append(newb) elif len(self.blocks) == 1: block = self.blocks[0] vals = block.values[slicer] if copy: vals = vals.copy() - new_blocks = [make_block(values=vals, placement=block.ref_locs, + new_blocks = [make_block(values=vals, placement=block.mgr_locs, klass=block.__class__, fastpath=True,)] return self.__class__(new_blocks, new_axes) @@ -2515,8 +2490,8 @@ def fast_xs(self, loc): result = np.empty(n, dtype=dtype) for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None - # result[blk.ref_locs] = blk._slice((slice(None), loc)) - for i, rl in enumerate(blk.ref_locs): + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): result[rl] = blk._try_coerce_result(blk.iget((i, loc))) return result @@ -2538,12 +2513,11 @@ def consolidate(self): def _consolidate_inplace(self): if not self.is_consolidated(): - self.blocks = _consolidate(self.blocks) + self.blocks = tuple(_consolidate(self.blocks)) self._is_consolidated = True self._known_consolidated = True - self._set_has_sparse() - self._rebuild_ref_locs() + self._rebuild_blknos_and_blklocs() def get(self, item): """ @@ -2574,8 +2548,7 @@ def get(self, item): indexer=indexer, axis=0, allow_dups=True) def iget(self, i): - b, loc = self._ref_locs[i] - return b.iget(loc) + return self.blocks[self._blknos[i]].iget(self._blklocs[i]) def get_scalar(self, tup): """ @@ -2583,8 +2556,10 @@ def get_scalar(self, tup): """ full_loc = list(ax.get_loc(x) for ax, x in zip(self.axes, tup)) - blk, blk_loc = self._ref_locs[full_loc[0]] - full_loc[0] = blk_loc + blk = self.blocks[self._blknos[full_loc[0]]] + full_loc[0] = self._blklocs[full_loc[0]] + + # FIXME: this may return non-upcasted types? 
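+        # (full_loc[0] has been remapped above from a manager-wide item
+        # position to the row of that item within the owning block)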
return blk.values[tuple(full_loc)] def delete(self, item): @@ -2595,29 +2570,35 @@ def delete(self, item): is_deleted = np.zeros(self.shape[0], dtype=np.bool_) is_deleted[indexer] = True - ref_loc_offset = is_deleted.cumsum() + ref_loc_offset = -is_deleted.cumsum() - new_items = self.items[~is_deleted] - new_blocks = [] + is_blk_deleted = [False] * len(self.blocks) - for blk in self.blocks: - brl = blk.ref_locs - blk_del = is_deleted[brl] - blk_del_count = np.count_nonzero(blk_del) + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] - if blk_del_count == len(brl): - continue + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] - blk._ref_locs -= ref_loc_offset[brl] - if blk_del_count != 0: - blk = blk._getitem_block(~blk_del) + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs - new_blocks.append(blk) + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) - self.axes[0] = new_items - self.blocks = new_blocks + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple(b for blkno, b in enumerate(self.blocks) + if not is_blk_deleted[blkno]) self._shape = None - self._rebuild_ref_locs() + self._rebuild_blknos_and_blklocs() def set(self, item, value, check=False): """ @@ -2626,22 +2607,22 @@ def set(self, item, value, check=False): if check, then validate that we are not setting the same data in-place """ # FIXME: refactor, clearly separate broadcasting & zip-like assignment - is_sparse = isinstance(value, SparseArray) + value_is_sparse = isinstance(value, SparseArray) - if is_sparse: + if value_is_sparse: assert self.ndim == 2 - def value_getitem(locs): + def value_getitem(placement): return value else: if value.ndim == self.ndim - 1: value = value.reshape((1,) + value.shape) - def value_getitem(locs): + def value_getitem(placement): return value else: - def value_getitem(locs): - return value[locs] + def value_getitem(placement): + return value[placement.indexer] if value.shape[1:] != self.shape[1:]: raise AssertionError('Shape of new values must be compatible ' 'with manager shape') @@ -2656,49 +2637,72 @@ def value_getitem(locs): if isinstance(loc, int): loc = [loc] - ref_locs = self._ref_locs[loc] + blknos = self._blknos[loc] + blklocs = self._blklocs[loc] unfit_mgr_locs = [] unfit_val_locs = [] - for blk, blk_locs, val_locs in ref_loc_groupby_block(ref_locs): + removed_blknos = [] + for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): blk.set(blk_locs, value_getitem(val_locs), check=check) else: - unfit_mgr_locs.append(blk.ref_locs[blk_locs]) + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) - new_blk_ref_locs = np.delete(blk.ref_locs, blk_locs, axis=0) - new_blk_len = len(new_blk_ref_locs) - if not new_blk_len: - self.blocks.remove(blk) + # If all block items are unfit, schedule the block for removal. 
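+                # (blocks scheduled here are dropped and _blknos renumbered
+                # only after this loop finishes, see below)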
+ if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) else: - blk.values = np.delete(blk.values, blk_locs, axis=0) - blk._ref_locs = new_blk_ref_locs - self._ref_locs[new_blk_ref_locs] = \ - lib.fast_zip([np.array([blk] * new_blk_len), - np.arange(new_blk_len)]) + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int_) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - + len(removed_blknos)) + self._blknos = new_blknos.take(self._blknos, axis=0) + self.blocks = tuple(blk for i, blk in enumerate(self.blocks) + if i not in set(removed_blknos)) if unfit_val_locs: - unfit_val_locs = np.concatenate(unfit_val_locs) unfit_mgr_locs = np.concatenate(unfit_mgr_locs) - unfit_count = len(unfit_val_locs) - - if is_sparse: - for mgr_loc in unfit_mgr_locs: - new_block = make_block(values=value.copy(), - ndim=self.ndim, - placement=[mgr_loc]) - self.blocks.append(new_block) - self._ref_locs[mgr_loc] = (new_block, 0) + unfit_count = len(unfit_mgr_locs) + + new_blocks = [] + if value_is_sparse: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. + new_blocks.extend( + make_block(values=value.copy(), ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1)) + for mgr_loc in unfit_mgr_locs) + + self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + + len(self.blocks)) + self._blklocs[unfit_mgr_locs] = 0 + else: - new_block = make_block(values=value[unfit_val_locs], - ndim=self.ndim, - placement=unfit_mgr_locs) + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) - self.blocks.append(new_block) - self._ref_locs[unfit_mgr_locs] = lib.fast_zip([ - np.array([new_block] * unfit_count, dtype=np.object_), - np.arange(unfit_count)]) + new_blocks.append( + make_block(values=value_getitem(unfit_val_items), + ndim=self.ndim, placement=unfit_mgr_locs)) + + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) + + self.blocks += tuple(new_blocks) # Newly created block's dtype may already be present. self._known_consolidated = False @@ -2723,132 +2727,169 @@ def insert(self, loc, item, value, allow_duplicates=False): if not isinstance(loc, int): raise TypeError("loc must be int") - new_items = self.items.insert(loc, item) block = make_block(values=value, ndim=self.ndim, - placement=[loc]) - new_ref_locs = np.insert(self._ref_locs, loc, None, axis=0) - new_ref_locs[loc] = (block, 0) + placement=slice(loc, loc+1)) - for blk in self.blocks: - blk._ref_locs[blk._ref_locs >= loc] += 1 + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster (at least in numpy 1.7.1), let's use it + # if we can. 
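+            # The new item lands at the very end and starts a fresh block,
+            # so its blkloc is 0 and its blkno is len(self.blocks) as of
+            # before the block is appended.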
+ self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) - self.blocks.append(block) - self.axes[0] = new_items + self.axes[0] = self.items.insert(loc, item) + + self.blocks += (block,) self._shape = None - self._ref_locs = new_ref_locs self._known_consolidated = False if len(self.blocks) > 100: self._consolidate_inplace() - def reindex_axis(self, new_axis, axis, method=None, limit=None, + def reindex_axis(self, new_index, axis, method=None, limit=None, fill_value=None, copy=True): - mgr = self if not copy else self.copy(deep=True) - - new_axis = _ensure_index(new_axis) - new_axis, indexer = mgr.axes[axis].reindex( - new_axis, method=method, limit=limit, copy_if_needed=True) + """ + Conform block manager to new index. + """ + new_index = _ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit, copy_if_needed=True) - return mgr.reindex_indexer(new_axis, indexer, axis=axis, - fill_value=fill_value) + return self.reindex_indexer(new_index, indexer, axis=axis, + fill_value=fill_value, copy=copy) def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, - allow_dups=False): + allow_dups=False, copy=True): """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool + pandas-indexer with -1's only. """ + + if indexer is None: + if new_axis is self.axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result + + self._consolidate_inplace() + # trying to reindex on an axis with duplicates if (not allow_dups and not self.axes[axis].is_unique - and indexer is not None and len(indexer)): + and len(indexer)): raise ValueError("cannot reindex from a duplicate axis") if axis >= self.ndim: - raise AssertionError("Requested axis not found in manager") - - # FIXME: this code comes from generic.py, see if any of that is needed - # elif (baxis == 0 and - # index is not new_data.axes[baxis]): - # new_data = new_data.reindex_items(index, copy=copy, - # fill_value=fill_value) - - # elif (baxis > 0 and index is not None and - # index is not new_data.axes[baxis]): - # new_data = new_data.copy(deep=copy) - # new_data.set_axis(baxis, index) + raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._get_blocks_for_items_indexer(indexer, - fill_value) + new_blocks = self._slice_take_blocks_ax0( + indexer, fill_tuple=(fill_value,)) else: - # TODO: is this faster than blk.reindex_axis? - # return self.apply('take', - # axes=new_axes, - # indexer=indexer, - # ref_items=new_axes[0], - # new_axis=new_axes[axis], - # axis=axis) - new_blocks = [blk.reindex_axis(indexer, axis=axis, - fill_value=fill_value) + new_blocks = [blk.take_nd(indexer, axis=axis, + fill_tuple=(fill_value if fill_value is not None else + blk.fill_value,)) for blk in self.blocks] new_axes = list(self.axes) new_axes[axis] = new_axis return self.__class__(new_blocks, new_axes) - def _get_blocks_for_items_indexer(self, indexer, fill_value): + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): """ - Reindex blocks at axis=0 (overloaded for SingleBlockManager). + Slice/take blocks along axis=0. 
+ + Overloaded for SingleBlock Returns ------- new_blocks : list of Block """ - # fill_value[0] == None will group soon-to-be-added items under None - # fill_value[1] is an arbitrary integer (it's ignored) - new_ref_locs = com.take_1d(self._ref_locs, indexer, - fill_value=(None, 0)) - new_blocks = [] - for blk, blk_locs, mgr_locs in ref_loc_groupby_block(new_ref_locs): - if blk is None: - new_blocks.append(self._make_na_block( + + allow_fill = fill_tuple is not None + + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill) + + if self._is_single_block: + blk = self.blocks[0] + + if sl_type in ('slice', 'mask'): + return [blk.getitem_block(slobj, + new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = com._maybe_promote(blk.dtype) + fill_tuple = (fill_value,) + + return [blk.take_nd(slobj, axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple)] + + if sl_type in ('slice', 'mask'): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = com.take_1d(self._blknos, slobj, fill_value=-1, + allow_fill=allow_fill) + blklocs = com.take_1d(self._blklocs, slobj, fill_value=-1, + allow_fill=allow_fill) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append(self._make_na_block( placement=mgr_locs, fill_value=fill_value)) else: + blk = self.blocks[blkno] + # Otherwise, slicing along items axis is necessary. if blk.is_sparse: - # If it's a sparse block, it's easy: - # - # - it can only contain 1 item - # - if blk is here, the item wasn't deleted - # - if blk wasn't handled above, the item is multiplied - # - # Hence the block is replicated. + # A sparse block, it's easy, because there's only one item + # and each mgr loc is a copy of that single item. for mgr_loc in mgr_locs: newblk = blk.copy(deep=True) - newblk._ref_locs = np.array([mgr_loc]) - new_blocks.append(newblk) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) else: - # FIXME: this hack makes sure post-reindex blocks enumerate - # manager locs in ascending order. It was implemented to - # make pytables serialization test happy and should be - # removed once the codebase successfully switches to - # axis-oblivious blocks & blockmanagers. 
- order = np.argsort(mgr_locs) - blk_locs = blk_locs.take(order) - mgr_locs = mgr_locs.take(order) - - new_values = com.take_1d(blk.values, blk_locs, - axis=0, allow_fill=False) - newblk = blk.__class__(values=new_values, - ndim=blk.ndim, - fastpath=True, - placement=mgr_locs,) - new_blocks.append(newblk) - - return new_blocks + blocks.append(blk.take_nd( + blklocs[mgr_locs.indexer], axis=0, + new_mgr_locs=mgr_locs, fill_tuple=None)) + + return blocks def _make_na_block(self, placement, fill_value=None): # TODO: infer dtypes other than float64 from fill_value @@ -2891,13 +2932,14 @@ def merge(self, other, lsuffix='', rsuffix=''): right=other.items, rsuffix=rsuffix) new_items = _concat_indexes([l, r]) - new_blocks = [] - for blocks, offset in [(self.blocks, 0), - (other.blocks, self.shape[0])]: - for blk in blocks: - blk = blk.copy(deep=False) - blk._ref_locs += offset - new_blocks.append(blk) + new_blocks = [blk.copy(deep=False) + for blk in self.blocks] + + offset = self.shape[0] + for blk in other.blocks: + blk = blk.copy(deep=False) + blk.mgr_locs = blk.mgr_locs.add(offset) + new_blocks.append(blk) new_axes = list(self.axes) new_axes[0] = new_items @@ -2916,39 +2958,6 @@ def _is_indexed_like(self, other): return False return True - def rename_axis(self, mapper, axis, copy=True): - """ - Rename one of axes. - - Parameters - ---------- - mapper : unary callable - axis : int - copy : boolean, default True - - """ - new_axis = _transform_index(self.axes[axis], mapper) - - if axis != 0: - new_blocks = self.blocks - else: - new_blocks = [] - for block in self.blocks: - newb = block.copy(deep=copy) - new_blocks.append(newb) - - new_axes = list(self.axes) - new_axes[axis] = new_axis - return self.__class__(new_blocks, new_axes) - - def add_prefix(self, prefix): - f = (('%s' % prefix) + '%s').__mod__ - return self.rename_axis(f, axis=0) - - def add_suffix(self, suffix): - f = ('%s' + ('%s' % suffix)).__mod__ - return self.rename_axis(f, axis=0) - def equals(self, other): self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): @@ -2960,26 +2969,16 @@ def equals(self, other): return all(block.equals(oblock) for block, oblock in zip(self.blocks, other.blocks)) - def group_blocks_by_ftype(self): - """ - Combine blocks into map: ftype -> [blk0, blk1, ...]. 
- - """ - bm = defaultdict(list) - for b in self.blocks: - bm[str(b.ftype)].append(b) - return bm - class SingleBlockManager(BlockManager): - """ manage a single block with """ + ndim = 1 _is_consolidated = True _known_consolidated = True - __slots__ = ['axes', 'blocks'] + __slots__ = () - def __init__(self, block, axis, do_integrity_check=False, fastpath=True): + def __init__(self, block, axis, do_integrity_check=False, fastpath=False): if isinstance(axis, list): if len(axis) != 1: @@ -2999,12 +2998,7 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): raise ValueError('Cannot create SingleBlockManager with ' 'more than 1 block') block = block[0] - if not isinstance(block, Block): - block = make_block(block, ndim=1, fastpath=True, - placement=np.arange(len(axis))) - else: - self.axes = [_ensure_index(axis)] # create the block here @@ -3021,9 +3015,10 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): 'more than 1 block') block = block[0] - if not isinstance(block, Block): - block = make_block(block, axis, ndim=1, - fastpath=True, placement=None) + if not isinstance(block, Block): + block = make_block(block, + placement=slice(0, len(axis)), + ndim=1, fastpath=True) self.blocks = [block] @@ -3038,26 +3033,6 @@ def _block(self): def _values(self): return self._block.values - @property - def _has_sparse(self): - return self._block.is_sparse - - def _set_has_sparse(self): - # _has_sparse is a property, nothing to set here - pass - - # def apply(self, f, axes=None, do_integrity_check=False, **kwargs): - # """ - # fast path for SingleBlock Manager - - # ssee also BlockManager.apply - # """ - # applied = getattr(self._block, f)(**kwargs) - # bm = self.__class__(applied, axes or self.axes, - # do_integrity_check=do_integrity_check) - # bm._consolidate_inplace() - # return bm - def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return @@ -3088,44 +3063,19 @@ def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=limit, fill_value=fill_value) if self._block.is_sparse: - make_block = self._block.make_block + make_block = self._block.make_block_same_class block = make_block(new_values, copy=copy, - placement=np.arange(len(new_axis))) + placement=slice(0, len(new_axis))) - # block = self._block.reindex_items_from(new_axis, indexer=indexer, - # method=method, - # fill_value=fill_value, - # limit=limit, copy=copy) mgr = SingleBlockManager(block, new_axis) mgr._consolidate_inplace() return mgr - def _reindex_indexer_items(self, new_items, indexer, fill_value): - # equiv to a reindex - return self.reindex(new_items, indexer=indexer, fill_value=fill_value, - copy=False) - - def _delete_from_block(self, i, item): - super(SingleBlockManager, self)._delete_from_block(i, item) - - # possibly need to merge split blocks - if len(self.blocks) > 1: - new_values = np.concatenate([b.values for b in self.blocks]) - new_items = Index(np.concatenate([b.items for b in self.blocks])) - - block = make_block(values=new_values, placement=None, - dtype=self._block.dtype,) - - elif len(self.blocks): - block = self.blocks[0] - else: - block = make_block(values=np.array([], dtype=self._block.dtype), - placement=None) - - self.blocks = [block] + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") - def get_slice(self, slobj): return self.__class__(self._block._slice(slobj), self.index[slobj], fastpath=True) 
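The delete/set/insert/take paths above all reduce axis-0 item lookup to the
same pair of integer arrays. As a reading aid, here is a minimal,
self-contained NumPy sketch of that bookkeeping; `blocks`, `blknos`,
`blklocs` and `iget` are illustrative names for this sketch only, not
pandas API:

    import numpy as np

    # Two blocks backing three manager items, the way a consolidated
    # manager would store them.
    blocks = [np.array([[1., 2.], [3., 4.]]),  # block 0: items 0 and 2
              np.array([[10., 20.]])]          # block 1: item 1

    blknos = np.array([0, 1, 0])   # item i lives in block blknos[i] ...
    blklocs = np.array([0, 0, 1])  # ... at row blklocs[i] of that block

    def iget(i):
        # The lookup that every axis-0 operation reduces to.
        return blocks[blknos[i]][blklocs[i]]

    assert iget(0).tolist() == [1., 2.]
    assert iget(1).tolist() == [10., 20.]
    assert iget(2).tolist() == [3., 4.]

Keeping two flat integer arrays (plus each block's `mgr_locs`) in sync is
what replaces the object-valued `_ref_locs` pairs that the removed code
above rebuilt with `lib.fast_zip`.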
@@ -3153,10 +3103,10 @@ def get_ftype_counts(self): return {self.ftype: 1} def get_dtypes(self): - return [self._block.dtype] + return np.array([self._block.dtype]) def get_ftypes(self): - return [self._block.ftype] + return np.array([self._block.ftype]) @property def values(self): @@ -3185,15 +3135,9 @@ def delete(self, item): Ensures that self.blocks doesn't become empty. """ - # Also, make sure dtype is preserved. - dtype = self._block.dtype - - super(SingleBlockManager, self).delete(item) - - if not self.blocks: - self.blocks = [make_block(values=np.empty(0, dtype=dtype), - placement=np.arange(len(self.items)), - ndim=1, dtype=dtype, fastpath=True)] + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) def fast_xs(self, loc): """ @@ -3202,25 +3146,6 @@ def fast_xs(self, loc): """ return self._block.values[loc] - def _get_blocks_for_items_indexer(self, indexer, fill_value): - """ - Reindex blocks at axis=0 (overloaded for SingleBlockManager). - - Returns - ------- - new_blocks : list of Block - - """ - if indexer is None: - new_values = self._values.copy() - else: - new_values = com.take_1d(self._values, indexer, - fill_value=fill_value) - - return [make_block(values=new_values, - placement=np.arange(len(new_values)), - ndim=self.ndim, fastpath=True)] - def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ @@ -3239,7 +3164,7 @@ def create_block_manager_from_blocks(blocks, axes): # basically "all items", but if there're many, don't bother # converting, it's an error anyway. blocks = [make_block(values=blocks[0], - placement=np.arange(len(axes[0])),)] + placement=slice(0, len(axes[0])))] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() @@ -3526,15 +3451,17 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype - new_ref_locs = np.concatenate([b.ref_locs for b in blocks]) + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. + new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) new_values = _vstack([b.values for b in blocks], dtype) - argsort = np.argsort(new_ref_locs) + argsort = np.argsort(new_mgr_locs) new_values = new_values[argsort] - new_ref_locs = new_ref_locs[argsort] + new_mgr_locs = new_mgr_locs[argsort] return make_block(new_values, - fastpath=True, placement=new_ref_locs) + fastpath=True, placement=new_mgr_locs) # no merge return blocks @@ -3560,14 +3487,6 @@ def _vstack(to_stack, dtype): return np.vstack(to_stack) -def _possibly_convert_to_indexer(loc): - if com._is_bool_indexer(loc): - loc = [i for i, v in enumerate(loc) if v] - elif isinstance(loc, slice): - loc = lrange(loc.start, loc.stop) - return loc - - def _possibly_compare(a, b, op): res = op(a, b) is_a_array = isinstance(a, np.ndarray) @@ -3650,33 +3569,25 @@ def _invert_reordering(reordering, minlength=None): return inverted -def ref_loc_groupby_block(ref_locs): +def _get_blkno_placements(blknos, blk_count, group=True): """ - Group given ref_locs by block. 
+ + Parameters + ---------- + blknos : array of int64 + blk_count : int + group : bool Returns ------- iterator - Yield (block, block_locs, original_locs) + yield (BlockPlacement, blkno) """ - if len(ref_locs) == 0: - return - blocks = com._ensure_object(lib.map_infer(ref_locs, - operator.itemgetter(0))) - indices = lib.map_infer(ref_locs, operator.itemgetter(1)) - - factorizer = Factorizer(len(blocks)) - block_ids = factorizer.factorize(blocks, na_sentinel=-1) - - for i in range(factorizer.get_count()): - locs = (block_ids == i).nonzero()[0] - yield blocks[locs[0]], indices[locs], locs - - na_locs = (block_ids == -1).nonzero()[0] - if len(na_locs): - yield None, indices[na_locs], na_locs + # FIXME: blk_count is unused, but it may avoid the use of dicts in cython + for blkno, indexer in lib.get_blkno_indexers(blknos, group): + yield blkno, BlockPlacement(indexer) def items_overlap_with_suffix(left, lsuffix, right, rsuffix): @@ -3774,14 +3685,14 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): copy : bool """ - concat_plans = [] + concat_plan = combine_concat_plans([get_mgr_concatenation_plan(mgr, indexers) + for mgr, indexers in mgrs_indexers], + concat_axis) - for mgr, indexers in mgrs_indexers: - plan = get_mgr_concatenation_plan(mgr, indexers) - concat_plans = combine_concat_plans(concat_plans, plan, concat_axis) - - blocks = [concatenate_by_plan(plan, concat_axis, copy=copy) - for plan in concat_plans] + blocks = [make_block(concatenate_join_units(join_units, concat_axis, + copy=copy), + placement=placement) + for placement, join_units in concat_plan] return BlockManager(blocks, axes) @@ -3798,20 +3709,32 @@ def get_empty_dtype_and_na(join_units): na """ + if len(join_units) == 1: + blk = join_units[0].block + if blk is None: + return np.float64, np.nan + else: + return blk.dtype, None + has_none_blocks = False - dtypes = set() - upcast_classes = set() - null_upcast_classes = set() - for unit in join_units: + dtypes = [None] * len(join_units) + + for i, unit in enumerate(join_units): if unit.block is None: - # This value is not supposed to be used anywhere, it's here to make - # sure "monotype" check (len(dtypes) == 1) fails and to indicate - # that upcasting is required. has_none_blocks = True - continue + else: + dtypes[i] = unit.dtype + + if not has_none_blocks and len(set(dtypes)) == 1: + # Unanimous decision, nothing to upcast. + return dtypes[0], None - dtype = unit.dtype - dtypes.add(unit.dtype) + # dtypes = set() + upcast_classes = set() + null_upcast_classes = set() + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue if issubclass(dtype.type, (np.object_, np.bool_)): upcast_cls = 'object' @@ -3830,10 +3753,6 @@ def get_empty_dtype_and_na(join_units): else: upcast_classes.add(upcast_cls) - if not has_none_blocks and len(dtypes) == 1: - # Unanimous decision, nothing to upcast. - return next(iter(dtypes)), None - if not upcast_classes: upcast_classes = null_upcast_classes @@ -3850,11 +3769,13 @@ def get_empty_dtype_and_na(join_units): raise AssertionError("invalid dtype determination in get_concat_dtype") -def concatenate_by_plan(plan, concat_axis, copy): +def concatenate_join_units(join_units, concat_axis, copy): """ - Make block from concatenation plan. + Concatenate values from several join units along selected axis. """ - concat_start, join_units = plan + if concat_axis == 0 and len(join_units) > 1: + # Concatenating join units along ax0 is handled in _merge_blocks. 
+ raise AssertionError("Concatenating join units along axis0") empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) @@ -3864,20 +3785,20 @@ def concatenate_by_plan(plan, concat_axis, copy): if len(to_concat) == 1: # Only one block, nothing to concatenate. - if copy: - concat_values = to_concat[0].copy() - else: - concat_values = to_concat[0] + concat_values = to_concat[0] + if copy and concat_values.base is not None: + concat_values = concat_values.copy() else: concat_values = com._concat_compat(to_concat, axis=concat_axis) - rng = np.arange(concat_values.shape[0]) - + # FIXME: optimization potential: if len(join_units) == 1, single join unit + # is densified and sparsified back. if any(unit.is_sparse for unit in join_units): - concat_values = SparseArray(concat_values[0]) - - return make_block(concat_values, - placement=rng + concat_start) + # If one of the units was sparse, concat_values are 2d and there's only + # one item. + return SparseArray(concat_values[0]) + else: + return concat_values def get_mgr_concatenation_plan(mgr, indexers): @@ -3891,7 +3812,7 @@ def get_mgr_concatenation_plan(mgr, indexers): Returns ------- - plan : list of (start_loc, [JoinUnit]) tuples + plan : list of (BlockPlacement, JoinUnit) tuples """ # Calculate post-reindex shape , save for item axis which will be separate @@ -3899,109 +3820,124 @@ def get_mgr_concatenation_plan(mgr, indexers): mgr_shape = list(mgr.shape) for ax, indexer in indexers.items(): mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) if 0 in indexers: - indexer = indexers.pop(0) - ref_locs = com.take_1d(mgr._ref_locs, indexer, fill_value=(None, 0)) + ax0_indexer = indexers.pop(0) + blknos = com.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) + blklocs = com.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) else: - ref_locs = mgr._ref_locs + + if mgr._is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + + ax0_indexer = None + blknos = mgr._blknos + blklocs = mgr._blklocs plan = [] - for blk, blk_locs, concat_locs in ref_loc_groupby_block(ref_locs): - # result_locs are assumed to be sorted - slices = locs_to_contiguous_sequences(concat_locs) + for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks), + group=False): + assert placements.is_slice_like - for slc in slices: - join_unit_indexers = indexers.copy() - axis0_blk_indexer = blk_locs[slc] + join_unit_indexers = indexers.copy() + + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) + + if blkno == -1: + unit = JoinUnit(None, shape) + else: + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) and + # Fastpath detection of join unit not needing to reindex its + # block: no ax0 reindexing took place and block placement was + # sequential before. + ((ax0_indexer is None + and blk.mgr_locs.is_slice_like + and blk.mgr_locs.as_slice.step == 1) or + # Slow-ish detection: all indexer locs are sequential (and + # length match is checked above). + (np.diff(ax0_blk_indexer) == 1).all())) # Omit indexer if no item reindexing is required. 
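+                # (Sequential locs plus the length match above imply the
+                # indexer is exactly arange(len(blk)), i.e. an identity
+                # take.)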
- if (blk is None or - np.array_equal(axis0_blk_indexer, np.arange(blk.shape[0]))): + if unit_no_ax0_reindexing: join_unit_indexers.pop(0, None) else: - join_unit_indexers[0] = axis0_blk_indexer + join_unit_indexers[0] = ax0_blk_indexer - blk_shape = copy.copy(mgr_shape) - blk_shape[0] = len(axis0_blk_indexer) - unit = JoinUnit(blk, join_unit_indexers, shape=blk_shape) + unit = JoinUnit(blk, shape, join_unit_indexers) - plan.append((concat_locs[slc.start], [unit])) + plan.append((placements, unit)) - plan.sort() return plan -def combine_concat_plans(existing_plan, new_plan, concat_axis): +def combine_concat_plans(plans, concat_axis): """ Combine multiple concatenation plans into one. existing_plan is updated in-place. """ - if not existing_plan: - # Shortcut: nothing to combine with - return new_plan - - if concat_axis == 0: - # Another shortcut: when concatenating along item axis, plans can be - # simply appended. - last_offset, last_units = existing_plan[-1] - plan_offset = last_offset + last_units[0].shape[0] - return existing_plan + [(off_i + plan_offset, units_i) - for off_i, units_i in new_plan] - - from collections import deque - old_items = deque(existing_plan) - new_items = deque(new_plan) - result = [] + if len(plans) == 1: + for p in plans[0]: + yield p[0], [p[1]] - while new_items: - old_start, old_units = old_items.popleft() - new_start, new_units = new_items.popleft() + elif concat_axis == 0: + offset = 0 + for plan in plans: + last_plc = None - assert old_start == new_start + for plc, unit in plan: + yield plc.add(offset), [unit] + last_plc = plc - old_len = old_units[0].shape[0] - new_len = new_units[0].shape[0] + if last_plc is not None: + offset += last_plc.as_slice.stop - # Trim either old or new part as necessary - common_len = min(old_len, new_len) - if new_len > common_len: - new_items.appendleft((new_start + common_len, - [trim_join_unit(unit, common_len) - for unit in new_units])) - elif old_len > common_len: - old_items.appendleft((old_start + common_len, - [trim_join_unit(unit, common_len) - for unit in old_units])) + else: + num_ended = [0] + def _next_or_none(seq): + retval = next(seq, None) + if retval is None: + num_ended[0] += 1 + return retval - result.append((old_start, old_units + new_units)) + plans = list(map(iter, plans)) + next_items = list(map(_next_or_none, plans)) - # The loop terminates when there's no new items, make sure that all old - # items are processed. - assert not old_items + while num_ended[0] != len(next_items): + if num_ended[0] > 0: + raise ValueError("Plan shapes are not aligned") - return result + placements, units = zip(*next_items) + lengths = list(map(len, placements)) + min_len, max_len = min(lengths), max(lengths) -def locs_to_contiguous_sequences(locs): - """ - Return contiguous sequences found in locs as slices. - """ - # FIXME: the code looks vaguely familiar, maybe there another version that - # can be reused instead - assert locs.ndim == 1 - length = len(locs) - - diff = np.diff(locs, axis=0) - break_locs = (diff != 1).nonzero()[0] + 1 + if min_len == max_len: + yield placements[0], units + next_items[:] = map(_next_or_none, plans) + else: + yielded_placement = None + yielded_units = [None] * len(next_items) + for i, (plc, unit) in enumerate(next_items): + yielded_units[i] = unit + if len(plc) > min_len: + # trim_join_unit updates unit in place, so only + # placement needs to be sliced to skip min_len. 
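+                            # (e.g. placements of length 5 and 3 meeting
+                            # here: the first 3 rows of each are yielded
+                            # together now, and the longer unit's remaining
+                            # 2 rows re-enter the merge via next_items.)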
+ next_items[i] = (plc[min_len:], + trim_join_unit(unit, min_len)) + else: + yielded_placement = plc + next_items[i] = _next_or_none(plans[i]) - if len(break_locs) == 0: - return [slice(0, length)] - else: - return [slice(b, e) - for b, e in lib.fast_zip([np.r_[0, break_locs], - np.r_[break_locs, length]])] + yield yielded_placement, yielded_units def trim_join_unit(join_unit, length): @@ -4012,23 +3948,29 @@ def trim_join_unit(join_unit, length): """ if 0 not in join_unit.indexers: - join_unit.indexers[0] = np.arange(join_unit.shape[0]) + extra_indexers = join_unit.indexers - extra_indexers = copy.copy(join_unit.indexers) - extra_shape = copy.copy(join_unit.shape) + if join_unit.block is None: + extra_block = None + else: + extra_block = join_unit.block.getitem_block(slice(length, None)) + join_unit.block = join_unit.block.getitem_block(slice(length)) + else: + extra_block = join_unit.block - extra_shape[0] = join_unit.shape[0] - length - extra_indexers[0] = extra_indexers[0][length:] + extra_indexers = copy.copy(join_unit.indexers) + extra_indexers[0] = extra_indexers[0][length:] + join_unit.indexers[0] = join_unit.indexers[0][:length] - join_unit.shape[0] = length - join_unit.indexers[0] = join_unit.indexers[0][:length] + extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] + join_unit.shape = (length,) + join_unit.shape[1:] - return JoinUnit(block=join_unit.block, indexers=extra_indexers, + return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) class JoinUnit(object): - def __init__(self, block, indexers, shape): + def __init__(self, block, shape, indexers={}): # Passing shape explicitly is required for cases when block is None. self.block = block self.indexers = indexers @@ -4061,42 +4003,66 @@ def dtype(self): @cache_readonly def is_null(self): - return self.block is None or isnull(self.block.values).all() + if self.block is None: + return True + + if not self.block._can_hold_na: + return False + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 1000 value + # was chosen rather arbitrarily. + values_flat = self.block.values.ravel() + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isnull(values_flat[i: i + chunk_len]).all(): + return False + + return True @cache_readonly def is_sparse(self): return self.block is not None and self.block.is_sparse def get_reindexed_values(self, empty_dtype, upcasted_na): - if upcasted_na is not None: - fill_value = upcasted_na - else: - # If upcasted_na is None, self.block should always exist. If it - # doesn't (i.e. is None), then it's a bug in get_empty_dtype_and_na - # function. + if upcasted_na is None: + # No upcasting is necessary fill_value = self.block.fill_value - - if self.is_null: - missing_arr = np.empty(self.shape, dtype=empty_dtype) - if np.prod(self.shape): - # NumPy 1.6 workaround: this statement gets strange if all - # blocks are of same dtype and some of them are empty: empty - # one are considered "null" so they must be filled, but no - # dtype upcasting happens and the dtype may not allow NaNs. - # - # In general, no one should get hurt when one tries to put - # incorrect values into empty array, but numpy 1.6 is strict - # about that. 
- missing_arr.fill(fill_value) - return missing_arr + values = self.block.get_values() else: - if upcasted_na is not None and self.block.is_bool: + fill_value = upcasted_na + + if self.is_null: + missing_arr = np.empty(self.shape, dtype=empty_dtype) + if np.prod(self.shape): + # NumPy 1.6 workaround: this statement gets strange if all + # blocks are of same dtype and some of them are empty: + # empty one are considered "null" so they must be filled, + # but no dtype upcasting happens and the dtype may not + # allow NaNs. + # + # In general, no one should get hurt when one tries to put + # incorrect values into empty array, but numpy 1.6 is + # strict about that. + missing_arr.fill(fill_value) + return missing_arr + + if self.block.is_bool: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values else: + # No dtype upcasting is done here, it will be performed during + # concatenation itself. values = self.block.get_values() + if not self.indexers: + # If there's no indexing to be done, we want to signal outside + # code that this array must be copied explicitly. This is done + # by returning a view and checking `retval.base`. + return values.view() + else: for ax, indexer in self.indexers.items(): values = com.take_nd(values, indexer, axis=ax, fill_value=fill_value) @@ -4104,20 +4070,26 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): return values -# def _align_kwargs(blocks, items, kwargs, align_keys, copy): -# aligned_objs = dict((k, kwargs[k]) for k in align_keys.items() -# if hasattr(kwargs[k], 'reindex_axis')) - -# if aligned_objs: -# kwargs = kwargs.copy() - -# for b in blocks: -# if aligned_objs: -# b_items = items.take(b.ref_locs) - -# for k, obj in aligned_objs.items(): -# axis = getattr(obj, '_info_axis_number', 0) -# kwargs[k] = obj.reindex_axis(b_items, axis=axis, -# copy=copy) - -# yield b, kwargs +def _fast_count_smallints(arr): + """Faster version of set(arr) for sequences of small numbers.""" + if len(arr) == 0: + # Handle empty arr case separately: numpy 1.6 chokes on that. 
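+        # (The non-empty branch below returns (value, count) rows sorted
+        # by value, e.g. arr == [2, 2, 0] gives [[0, 1], [2, 2]].)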
+ return np.empty((0, 2), dtype=arr.dtype) + else: + counts = np.bincount(arr) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] + + +def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): + if isinstance(slice_or_indexer, slice): + return 'slice', slice_or_indexer, lib.slice_len(slice_or_indexer, + length) + elif (isinstance(slice_or_indexer, np.ndarray) and + slice_or_indexer.dtype == np.bool_): + return 'mask', slice_or_indexer, slice_or_indexer.sum() + else: + indexer = np.asanyarray(slice_or_indexer, dtype=np.int_) + if not allow_fill: + indexer = _maybe_convert_indices(indexer, length) + return 'fancy', indexer, len(indexer) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 3a977757b68ae..196b80a83723f 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -447,7 +447,7 @@ def _unstack_frame(obj, level): new_blocks = [] mask_blocks = [] for blk in obj._data.blocks: - blk_items = obj._data.items.take(blk.ref_locs) + blk_items = obj._data.items[blk.mgr_locs.indexer] bunstacker = _Unstacker(blk.values.T, obj.index, level=level, value_columns=blk_items) new_items = bunstacker.get_new_columns() diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 78f577566a28e..7da86565b51cd 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -356,7 +356,7 @@ def encode(obj): return {'typ': 'block_manager', 'klass': obj.__class__.__name__, 'axes': data.axes, - 'blocks': [{'items': data.items.take(b.ref_locs), + 'blocks': [{'items': data.items.take(b.mgr_locs), 'values': convert(b.values), 'shape': b.values.shape, 'dtype': b.dtype.num, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e49ab3884d312..95daa2bbc2752 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2669,7 +2669,7 @@ def write(self, obj, **kwargs): self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 - blk_items = data.items.take(blk.ref_locs) + blk_items = data.items.take(blk.mgr_locs) self.write_array('block%d_values' % i, blk.values, items=blk_items) self.write_index('block%d_items' % i, blk_items) @@ -3192,7 +3192,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, obj = _reindex_axis(obj, a[0], a[1]) def get_blk_items(mgr, blocks): - return [mgr.items.take(blk.ref_locs) for blk in blocks] + return [mgr.items.take(blk.mgr_locs) for blk in blocks] # figure out data_columns and get out blocks block_obj = self.get_object(obj).consolidate() @@ -3208,7 +3208,7 @@ def get_blk_items(mgr, blocks): axis=axis )._data - blocks = mgr.blocks + blocks = list(mgr.blocks) blk_items = get_blk_items(mgr, blocks) for c in data_columns: mgr = block_obj.reindex_axis([c], axis=axis)._data diff --git a/pandas/lib.pyx b/pandas/lib.pyx index a1fef095ea277..0bac4f8011420 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -19,6 +19,17 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, PyTuple_New, PyObject_SetAttrString) +cdef extern from "Python.h": + ctypedef struct PySliceObject: + pass + + cdef int PySlice_GetIndicesEx( + PySliceObject* s, Py_ssize_t length, + Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step, + Py_ssize_t *slicelength) except -1 + + + cimport cpython isnan = np.isnan @@ -1232,6 +1243,419 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, return result + +@cython.boundscheck(False) +@cython.wraparound(False) +def get_blkno_indexers(int64_t[:] blknos, bint group=True): + """ + Enumerate 
contiguous runs of integers in ndarray. + + Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))`` + pairs for each contiguous run found. + + If `group` is True and there is more than one run for a certain blkno, + ``(blkno, array)`` with an array containing positions of all elements equal + to blkno. + + Returns + ------- + iter : iterator of (int, slice or array) + + """ + # There's blkno in this function's name because it's used in block & + # blockno handling. + cdef: + int64_t cur_blkno + Py_ssize_t i, start, stop, n, diff + + list group_order + dict group_slices + int64_t[:] res_view + + n = blknos.shape[0] + + if n > 0: + start = 0 + cur_blkno = blknos[start] + + if group == False: + for i in range(1, n): + if blknos[i] != cur_blkno: + yield cur_blkno, slice(start, i) + + start = i + cur_blkno = blknos[i] + + yield cur_blkno, slice(start, n) + else: + group_order = [] + group_dict = {} + + for i in range(1, n): + if blknos[i] != cur_blkno: + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, i)] + else: + group_dict[cur_blkno].append((start, i)) + + start = i + cur_blkno = blknos[i] + + if cur_blkno not in group_dict: + group_order.append(cur_blkno) + group_dict[cur_blkno] = [(start, n)] + else: + group_dict[cur_blkno].append((start, n)) + + for blkno in group_order: + slices = group_dict[blkno] + if len(slices) == 1: + yield blkno, slice(slices[0][0], slices[0][1]) + else: + tot_len = sum(stop - start for start, stop in slices) + result = np.empty(tot_len, dtype=np.int64) + res_view = result + + i = 0 + for start, stop in slices: + for diff in range(start, stop): + res_view[i] = diff + i += 1 + + yield blkno, result + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef slice indexer_as_slice(int64_t[:] vals): + cdef: + Py_ssize_t i, n, start, stop + int64_t d + + if vals is None: + raise TypeError("vals must be ndarray") + + n = vals.shape[0] + + if n == 0 or vals[0] < 0: + return None + + if n == 1: + return slice(vals[0], vals[0] + 1, 1) + + if vals[1] < 0: + return None + + # n > 2 + d = vals[1] - vals[0] + + if d == 0: + return None + + for i in range(2, n): + if vals[i] < 0 or vals[i] - vals[i-1] != d: + return None + + start = vals[0] + stop = start + n * d + if stop < 0 and d < 0: + return slice(start, None, d) + else: + return slice(start, stop, d) + + +cpdef slice_canonize(slice s): + """ + Convert slice to canonical bounded form. + """ + cdef: + Py_ssize_t start, stop, step, length + + if s.step is None: + step = 1 + else: + step = s.step + if step == 0: + raise ValueError("slice step cannot be zero") + + if step > 0: + if s.stop is None: + raise ValueError("unbounded slice") + + stop = s.stop + if s.start is None: + start = 0 + else: + start = s.start + if start > stop: + start = stop + elif step < 0: + if s.start is None: + raise ValueError("unbounded slice") + + start = s.start + if s.stop is None: + stop = -1 + else: + stop = s.stop + if stop > start: + stop = start + + if start < 0 or (stop < 0 and s.stop is not None): + raise ValueError("unbounded slice") + + if stop < 0: + return slice(start, None, step) + else: + return slice(start, stop, step) + + +cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=INT64_MAX): + """ + Get (start, stop, step, length) tuple for a slice. + + If `objlen` is not specified, slice must be bounded, otherwise the result + will be wrong. 
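+    For a bounded slice the result does not depend on `objlen`, e.g.
+    ``slice_get_indices_ex(slice(2, 8, 2))`` returns ``(2, 8, 2, 3)``.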
+ + """ + cdef: + Py_ssize_t start, stop, step, length + + if slc is None: + raise TypeError("slc should be a slice") + + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) + return start, stop, step, length + + +cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=INT64_MAX) except -1: + """ + Get length of a bounded slice. + + The slice must not have any "open" bounds that would create dependency on + container size, i.e.: + - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None`` + - if ``s.step < 0``, ``s.start`` is not ``None`` + + Otherwise, the result is unreliable. + + """ + cdef: + Py_ssize_t start, stop, step, length + + if slc is None: + raise TypeError("slc must be slice") + + PySlice_GetIndicesEx(slc, objlen, + &start, &stop, &step, &length) + + return length + + +def slice_getitem(slice slc not None, ind): + cdef: + Py_ssize_t s_start, s_stop, s_step, s_len + Py_ssize_t ind_start, ind_stop, ind_step, ind_len + + s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc) + + if isinstance(ind, slice): + ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, + s_len) + + if ind_step > 0 and ind_len == s_len: + # short-cut for no-op slice + if ind_len == s_len: + return slc + + if ind_step < 0: + s_start = s_stop - s_step + ind_step = -ind_step + + s_step *= ind_step + s_stop = s_start + ind_stop * s_step + s_start = s_start + ind_start * s_step + + if s_step < 0 and s_stop < 0: + return slice(s_start, None, s_step) + else: + return slice(s_start, s_stop, s_step) + + else: + return np.arange(s_start, s_stop, s_step)[ind] + + +cdef class BlockPlacement: + # __slots__ = '_as_slice', '_as_array', '_len' + cdef slice _as_slice + cdef object _as_array + + cdef bint _has_slice, _has_array, _is_known_slice_like + + def __init__(self, val): + cdef slice slc + + self._has_slice = False + self._has_array = False + + if isinstance(val, slice): + slc = slice_canonize(val) + + if slc.start != slc.stop: + self._as_slice = slc + self._has_slice = True + else: + arr = np.empty(0, dtype=np.int64) + self._as_array = arr + self._has_array = True + else: + # Cython memoryview interface requires ndarray to be writeable. 
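+            # (np.require returns `val` unchanged when it is already a
+            # writeable int64 ndarray, so the common case is zero-copy.)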
+ arr = np.require(val, dtype=np.int64, requirements='W') + assert arr.ndim == 1 + self._as_array = arr + self._has_array = True + + def __unicode__(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + v = self._as_slice + else: + v = self._as_array + + return '%s(%r)' % (self.__class__.__name__, v) + + def __len__(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return slice_len(s) + else: + return len(self._as_array) + + def __iter__(self): + cdef slice s = self._ensure_has_slice() + cdef Py_ssize_t start, stop, step, _ + if s is not None: + start, stop, step, _ = slice_get_indices_ex(s) + return iter(range(start, stop, step)) + else: + return iter(self._as_array) + + @property + def as_slice(self): + cdef slice s = self._ensure_has_slice() + if s is None: + raise TypeError('Not slice-like') + else: + return s + + @property + def indexer(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return s + else: + return self._as_array + + def isin(self, arr): + from pandas.core.index import Int64Index + return Int64Index(self.as_array, copy=False).isin(arr) + + @property + def as_array(self): + cdef Py_ssize_t start, stop, end, _ + if not self._has_array: + start, stop, step, _ = slice_get_indices_ex(self._as_slice) + self._as_array = np.arange(start, stop, step, + dtype=np.int_) + self._has_array = True + return self._as_array + + @property + def is_slice_like(self): + cdef slice s = self._ensure_has_slice() + return s is not None + + def __getitem__(self, loc): + cdef slice s = self._ensure_has_slice() + if s is not None: + val = slice_getitem(s, loc) + else: + val = self._as_array[loc] + + if not isinstance(val, slice) and val.ndim == 0: + return val + + return BlockPlacement(val) + + def delete(self, loc): + return BlockPlacement(np.delete(self.as_array, loc, axis=0)) + + def append(self, others): + if len(others) == 0: + return self + + return BlockPlacement(np.concatenate([self.as_array] + + [o.as_array for o in others])) + + cdef iadd(self, other): + cdef slice s = self._ensure_has_slice() + cdef Py_ssize_t other_int, start, stop, step, l + + if isinstance(other, int) and s is not None: + other_int = other + + if other_int == 0: + return self + + start, stop, step, l = slice_get_indices_ex(s) + start += other_int + stop += other_int + + if ((step > 0 and start < 0) or + (step < 0 and stop < step)): + raise ValueError("iadd causes length change") + + if stop < 0: + self._as_slice = slice(start, None, step) + else: + self._as_slice = slice(start, stop, step) + + self._has_array = False + self._as_array = None + else: + newarr = self.as_array + other + if (newarr < 0).any(): + raise ValueError("iadd causes length change") + + self._as_array = newarr + self._has_array = True + self._has_slice = False + self._as_slice = None + + return self + + cdef BlockPlacement copy(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return BlockPlacement(s) + else: + return BlockPlacement(self._as_array) + + def add(self, other): + return self.copy().iadd(other) + + def sub(self, other): + return self.add(-other) + + cdef slice _ensure_has_slice(self): + if not self._has_slice: + self._as_slice = indexer_as_slice(self._as_array) + self._has_slice = True + return self._as_slice + + include "reduce.pyx" include "properties.pyx" include "inference.pyx" diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5d0aa992b9407..2aac364d16770 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ 
-11822,8 +11822,8 @@ def test_columns_with_dups(self): df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) - result = df._data._ref_locs - self.assertEqual(len(result), len(df.columns)) + self.assertEqual(len(df._data._blknos), len(df.columns)) + self.assertEqual(len(df._data._blklocs), len(df.columns)) # testing iget for i in range(len(df.columns)): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 261e1dd2a590c..a105b17795398 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1015,7 +1015,7 @@ def test_iloc_getitem_doc_issue(self): columns = list(range(0,8,2)) df = DataFrame(arr,index=index,columns=columns) - df._data.blocks[0].ref_locs + df._data.blocks[0].mgr_locs result = df.iloc[1:5,2:4] str(result) result.dtypes diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 1e4c621dd1683..b91384a840c33 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -18,58 +18,85 @@ def assert_block_equal(left, right): assert_almost_equal(left.values, right.values) assert(left.dtype == right.dtype) - assert_almost_equal(left.ref_locs, right.ref_locs) + assert_almost_equal(left.mgr_locs, right.mgr_locs) -def get_numeric_mat(n, k, dtype): - return np.repeat(np.atleast_2d(np.arange(k, dtype=dtype)), n, axis=0) +def get_numeric_mat(shape): + arr = np.arange(shape[0]) + return np.lib.stride_tricks.as_strided( + x=arr, shape=shape, + strides=(arr.itemsize,) + (0,) * (len(shape) - 1)).copy() N = 10 -def create_block(typestr, placement, num_rows=None, num_offset=None): - placement = np.asanyarray(placement) +def create_block(typestr, placement, item_shape=None, num_offset=0): + """ + Supported typestr: + + * float, f8, f4, f2 + * int, i8, i4, i2, i1 + * uint, u8, u4, u2, u1 + * complex, c16, c8 + * bool + * object, string, O + * datetime, dt + * sparse (SparseArray with fill_value=0.0) + * sparse_na (SparseArray with fill_value=np.nan) - if num_offset is None: - num_offset = 0 + """ + placement = BlockPlacement(placement) + num_items = len(placement) - if num_rows is None: - num_rows = N + if item_shape is None: + item_shape = (N,) + + shape = (num_items,) + item_shape + + mat = get_numeric_mat(shape) if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1', 'uint', 'u8', 'u4', 'u2', 'u1'): - values = get_numeric_mat(num_rows, len(placement), - dtype=np.dtype(typestr)).T + num_offset + values = mat.astype(typestr) + num_offset elif typestr in ('complex', 'c16', 'c8'): - values = get_numeric_mat(num_rows, len(placement), - dtype=np.dtype(typestr)).T + num_offset - values *= 1.j + values = 1.j * (mat.astype(typestr) + num_offset) elif typestr in ('object', 'string', 'O'): - values = np.repeat( - np.array([['A%s' % i - for i in np.arange(len(placement)) + num_offset]]), - num_rows, axis=0).T + values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], + shape) elif typestr in ('bool'): - values = np.ones((num_rows, len(placement)), dtype=np.bool_).T + values = np.ones(shape, dtype=np.bool_) elif typestr in ('datetime', 'dt'): - values = (randn(num_rows, len(placement)).astype(int) - .astype('M8[ns]')).T - elif typestr in ('sparse',): + values = (mat * 1e9).astype('M8[ns]') + elif typestr in ('sparse', 'sparse_na'): # FIXME: doesn't support num_rows != 10 - assert len(placement) == 1 - assert num_rows == 10 - values = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], 
fill_value=0) + assert shape[-1] == 10 + assert all(s == 1 for s in shape[:-1]) + if typestr.endswith('_na'): + fill_value = np.nan + else: + fill_value = 0.0 + values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value, + 4, 5, fill_value, 6], fill_value=fill_value) arr = values.sp_values.view() arr += (num_offset - 1) else: raise ValueError('Unsupported typestr: "%s"' % typestr) - return make_block(values, placement=placement) + return make_block(values, placement=placement, ndim=len(shape)) + + +def create_single_mgr(typestr, num_rows=None): + if num_rows is None: + num_rows = N + + return SingleBlockManager( + create_block(typestr, placement=slice(0, num_rows), item_shape=()), + np.arange(num_rows)) -def create_mgr(descr, num_rows=None): +def create_mgr(descr, item_shape=None): """ Construct BlockManager from string description. @@ -80,17 +107,7 @@ def create_mgr(descr, num_rows=None): Rules are rather simple: - * supported datatypes: - - * float, f8, f4, f2 - * int, i8, i4, i2, i1 - * uint, u8, u4, u2, u1 - * complex, c16, c8 - * bool - * object, string, O - * datetime, dt - * sparse - + * see list of supported datatypes in `create_block` method * components are semicolon-separated * each component is `NAME,NAME,NAME: DTYPE_ID` * whitespace around colons & semicolons are removed @@ -100,8 +117,8 @@ def create_mgr(descr, num_rows=None): 'a:f8-1; b:f8-2; c:f8-foobar' """ - if num_rows is None: - num_rows = N + if item_shape is None: + item_shape = (N,) offset = 0 mgr_items = [] @@ -126,12 +143,12 @@ def create_mgr(descr, num_rows=None): num_offset = 0 for blockstr, placement in block_placements.items(): typestr = blockstr.split('-')[0] - blocks.append(create_block(typestr, placement, num_rows=num_rows, + blocks.append(create_block(typestr, placement, item_shape=item_shape, num_offset=num_offset,)) num_offset += len(placement) - return BlockManager(sorted(blocks, key=lambda b: b.ref_locs[0]), - [mgr_items, np.arange(num_rows)]) + return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]), + [mgr_items] + [np.arange(n) for n in item_shape]) @@ -169,8 +186,8 @@ def _check(blk): _check(self.oblock) _check(self.bool_block) - def test_ref_locs(self): - assert_almost_equal(self.fblock.ref_locs, [0, 2, 4]) + def test_mgr_locs(self): + assert_almost_equal(self.fblock.mgr_locs, [0, 2, 4]) def test_attrs(self): self.assertEqual(self.fblock.shape, self.fblock.values.shape) @@ -188,7 +205,7 @@ def test_merge(self): bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd'])) merged = ablock.merge(bblock) - assert_almost_equal(merged.ref_locs, [0, 1, 2, 3]) + assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3]) assert_almost_equal(merged.values[[0, 2]], avals) assert_almost_equal(merged.values[[1, 3]], bvals) @@ -199,33 +216,9 @@ def test_copy(self): self.assertIsNot(cop, self.fblock) assert_block_equal(self.fblock, cop) - def test_items(self): - raise nose.SkipTest('items are removed from Block') - cols = self.fblock.items - self.assert_numpy_array_equal(cols, ['a', 'c', 'e']) - - cols2 = self.fblock.items - # disabled: items are generated - # self.assertIs(cols, cols2) - - def test_assign_ref_items(self): - raise nose.SkipTest('ref_items are removed from Block') - new_cols = Index(['foo', 'bar', 'baz', 'quux', 'hi']) - self.fblock.set_ref_items(new_cols) - self.assert_numpy_array_equal(self.fblock.items, ['foo', 'baz', 'hi']) - def test_reindex_index(self): pass - def test_reindex_items_from(self): - raise nose.SkipTest('reindex_items_from is removed from Block') - new_cols = Index(['e', 
'b', 'c', 'f']) - reindexed = self.fblock.reindex_items_from(new_cols) - assert_almost_equal(reindexed.ref_locs, [0, 2]) - self.assertEquals(reindexed.values.shape[0], 2) - self.assert_((reindexed.values[0] == 2).all()) - self.assert_((reindexed.values[1] == 1).all()) - def test_reindex_cast(self): pass @@ -233,19 +226,23 @@ def test_insert(self): pass def test_delete(self): - newb = self.fblock.delete(0) - assert_almost_equal(newb.ref_locs, [2, 4]) + newb = self.fblock.copy() + newb.delete(0) + assert_almost_equal(newb.mgr_locs, [2, 4]) self.assert_((newb.values[0] == 1).all()) - newb = self.fblock.delete(1) - assert_almost_equal(newb.ref_locs, [0, 4]) + newb = self.fblock.copy() + newb.delete(1) + assert_almost_equal(newb.mgr_locs, [0, 4]) self.assert_((newb.values[1] == 2).all()) - newb = self.fblock.delete(2) - assert_almost_equal(newb.ref_locs, [0, 2]) + newb = self.fblock.copy() + newb.delete(2) + assert_almost_equal(newb.mgr_locs, [0, 2]) self.assert_((newb.values[1] == 1).all()) - self.assertRaises(Exception, self.fblock.delete, 3) + newb = self.fblock.copy() + self.assertRaises(Exception, newb.delete, 3) def test_split_block_at(self): @@ -270,11 +267,6 @@ def test_split_block_at(self): bs = list(bblock.split_block_at('f')) self.assertEqual(len(bs), 0) - def test_unicode_repr(self): - raise nose.SkipTest('No items to test unicode on...') - str_repr = repr(create_block('object', [0, 1], - ref_items=['b', u("\u05d0")])) - def test_get(self): pass @@ -326,23 +318,16 @@ def test_duplicate_ref_loc_failure(self): axes, blocks = tmp_mgr.axes, tmp_mgr.blocks - blocks[0]._ref_locs = np.array([0]) - blocks[1]._ref_locs = np.array([0]) + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([0]) # test trying to create block manager with overlapping ref locs self.assertRaises(AssertionError, BlockManager, blocks, axes) - blocks[0]._ref_locs = np.array([0]) - blocks[1]._ref_locs = np.array([1]) + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([1]) mgr = BlockManager(blocks, axes) mgr.iget(1) - # invalidate the _ref_locs - for b in blocks: - b._ref_locs = None - mgr._ref_locs = None - mgr._items_map = None - self.assertRaises(Exception, mgr._rebuild_ref_locs) - def test_contains(self): self.assertIn('a', self.mgr) self.assertNotIn('baz', self.mgr) @@ -386,7 +371,7 @@ def test_get(self): assert_almost_equal(mgr.get('c'), values[2]) def test_set(self): - mgr = create_mgr('a,b,c: int', num_rows=3) + mgr = create_mgr('a,b,c: int', item_shape=(3,)) mgr.set('d', np.array(['foo'] * 3)) mgr.set('b', np.array(['bar'] * 3)) @@ -581,7 +566,7 @@ def test_consolidate_ordering_issues(self): cons = self.mgr.consolidate() self.assertEquals(cons.nblocks, 1) - assert_almost_equal(cons.blocks[0].ref_locs, + assert_almost_equal(cons.blocks[0].mgr_locs, np.arange(len(cons.items))) def test_reindex_index(self): @@ -618,7 +603,7 @@ def test_multiindex_xs(self): def test_get_numeric_data(self): mgr = create_mgr('int: int; float: float; complex: complex;' 'str: object; bool: bool; obj: object; dt: datetime', - num_rows=3) + item_shape=(3,)) mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) numeric = mgr.get_numeric_data() @@ -637,7 +622,7 @@ def test_get_numeric_data(self): def test_get_bool_data(self): mgr = create_mgr('int: int; float: float; complex: complex;' 'str: object; bool: bool; obj: object; dt: datetime', - num_rows=3) + item_shape=(3,)) mgr.set('obj', np.array([True, False, True], dtype=np.object_)) bools = mgr.get_bool_data() @@ -672,7 +657,334 @@ def test_equals(self): 
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) + def test_single_mgr_ctor(self): + mgr = create_single_mgr('f8', num_rows=5) + self.assertEquals(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) + + +class TestIndexing(object): + # Nosetests-style data-driven tests. + # + # This test applies different indexing routines to block managers and + # compares the outcome to the result of same operations on np.ndarray. + # + # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests + # and are disabled. + + MANAGERS = [ + create_single_mgr('f8', N), + create_single_mgr('i8', N), + #create_single_mgr('sparse', N), + create_single_mgr('sparse_na', N), + + # 2-dim + create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)), + create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)), + create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)), + create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)), + #create_mgr('a: sparse', item_shape=(N,)), + create_mgr('a: sparse_na', item_shape=(N,)), + + # 3-dim + create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)), + create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)), + create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)), + create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)), + # create_mgr('a: sparse', item_shape=(1, N)), + ] + + # MANAGERS = [MANAGERS[6]] + + def test_get_slice(self): + def assert_slice_ok(mgr, axis, slobj): + # import pudb; pudb.set_trace() + mat = mgr.as_matrix() + sliced = mgr.get_slice(slobj, axis=axis) + mat_slobj = (slice(None),) * axis + (slobj,) + assert_almost_equal(mat[mat_slobj], sliced.as_matrix()) + assert_almost_equal(mgr.axes[axis][slobj], sliced.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # slice + yield assert_slice_ok, mgr, ax, slice(None) + yield assert_slice_ok, mgr, ax, slice(3) + yield assert_slice_ok, mgr, ax, slice(100) + yield assert_slice_ok, mgr, ax, slice(1, 4) + yield assert_slice_ok, mgr, ax, slice(3, 0, -2) + + # boolean mask + yield assert_slice_ok, mgr, ax, np.array([], dtype=np.bool_) + yield (assert_slice_ok, mgr, ax, + np.ones(mgr.shape[ax], dtype=np.bool_)) + yield (assert_slice_ok, mgr, ax, + np.zeros(mgr.shape[ax], dtype=np.bool_)) + + if mgr.shape[ax] >= 3: + yield (assert_slice_ok, mgr, ax, + np.arange(mgr.shape[ax]) % 3 == 0) + yield (assert_slice_ok, mgr, ax, + np.array([True, True, False], dtype=np.bool_)) + + # fancy indexer + yield assert_slice_ok, mgr, ax, [] + yield assert_slice_ok, mgr, ax, lrange(mgr.shape[ax]) + + if mgr.shape[ax] >= 3: + yield assert_slice_ok, mgr, ax, [0, 1, 2] + yield assert_slice_ok, mgr, ax, [-1, -2, -3] + + def test_take(self): + def assert_take_ok(mgr, axis, indexer): + mat = mgr.as_matrix() + taken = mgr.take(indexer, axis) + assert_almost_equal(np.take(mat, indexer, axis), + taken.as_matrix()) + assert_almost_equal(mgr.axes[axis].take(indexer), + taken.axes[axis]) + + for mgr in self.MANAGERS: + for ax in range(mgr.ndim): + # take/fancy indexer + yield assert_take_ok, mgr, ax, [] + yield assert_take_ok, mgr, ax, [0, 0, 0] + yield assert_take_ok, mgr, ax, lrange(mgr.shape[ax]) + + if mgr.shape[ax] >= 3: + yield assert_take_ok, mgr, ax, [0, 1, 2] + yield assert_take_ok, mgr, ax, [-1, -2, -3] + + def test_reindex_axis(self): + def assert_reindex_axis_is_ok(mgr, axis, new_labels, + fill_value): + mat = mgr.as_matrix() + indexer = mgr.axes[axis].get_indexer_for(new_labels) + + reindexed = mgr.reindex_axis(new_labels, axis, + fill_value=fill_value) + assert_almost_equal(com.take_nd(mat, indexer, 
+                                            axis,
+                                            fill_value=fill_value),
+                                reindexed.as_matrix())
+            assert_almost_equal(reindexed.axes[axis], new_labels)
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                for fill_value in (None, np.nan, 100.):
+                    yield assert_reindex_axis_is_ok, mgr, ax, [], fill_value
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           mgr.axes[ax], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           mgr.axes[ax][[0, 0, 0]], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           ['foo', mgr.axes[ax][0], 'baz'], fill_value)
+
+                    if mgr.shape[ax] >= 3:
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][:-3], fill_value)
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][-3::-1], fill_value)
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value)
+
+    def test_reindex_indexer(self):
+        def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer,
+                                         fill_value):
+            mat = mgr.as_matrix()
+            reindexed_mat = com.take_nd(mat, indexer, axis,
+                                        fill_value=fill_value)
+            reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
+                                            fill_value=fill_value)
+            assert_almost_equal(reindexed_mat, reindexed.as_matrix())
+            assert_almost_equal(reindexed.axes[axis], new_labels)
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                for fill_value in (None, np.nan, 100.):
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           [], [], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo'] * mgr.shape[ax], np.arange(mgr.shape[ax]),
+                           fill_value)
+
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax][::-1], np.arange(mgr.shape[ax]),
+                           fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax], np.arange(mgr.shape[ax])[::-1],
+                           fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], [0, 0, 0], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], [-1, 0, -1], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', mgr.axes[ax][0], 'baz'], [-1, -1, -1],
+                           fill_value)
+
+                    if mgr.shape[ax] >= 3:
+                        yield (assert_reindex_indexer_is_ok, mgr, ax,
+                               ['foo', 'bar', 'baz'], [0, 1, 2], fill_value)
+
+
+class TestBlockPlacement(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_slice_len(self):
+        self.assertEquals(len(BlockPlacement(slice(0, 4))), 4)
+        self.assertEquals(len(BlockPlacement(slice(0, 4, 2))), 2)
+        self.assertEquals(len(BlockPlacement(slice(0, 3, 2))), 2)
+
+        self.assertEquals(len(BlockPlacement(slice(0, 1, 2))), 1)
+        self.assertEquals(len(BlockPlacement(slice(1, 0, -1))), 1)
+
+    def test_zero_step_raises(self):
+        self.assertRaises(ValueError, BlockPlacement, slice(1, 1, 0))
+        self.assertRaises(ValueError, BlockPlacement, slice(1, 2, 0))
+
+    def test_unbounded_slice_raises(self):
+        def assert_unbounded_slice_error(slc):
+            # assertRaisesRegexp is not available in py2.6
+            # self.assertRaisesRegexp(ValueError, "unbounded slice",
+            #                         lambda: BlockPlacement(slc))
+            self.assertRaises(ValueError, BlockPlacement, slc)
+
+        assert_unbounded_slice_error(slice(None, None))
+        assert_unbounded_slice_error(slice(10, None))
+        assert_unbounded_slice_error(slice(None, None, -1))
+        assert_unbounded_slice_error(slice(None, 10, -1))
+
+        # These are "unbounded" because a negative index changes meaning
+        # with the container's shape.
These are "unbounded" because negative index will change depending on + # container shape. + assert_unbounded_slice_error(slice(-1, None)) + assert_unbounded_slice_error(slice(None, -1)) + assert_unbounded_slice_error(slice(-1, -1)) + assert_unbounded_slice_error(slice(-1, None, -1)) + assert_unbounded_slice_error(slice(None, -1, -1)) + assert_unbounded_slice_error(slice(-1, -1, -1)) + + def test_not_slice_like_slices(self): + def assert_not_slice_like(slc): + self.assertTrue(not BlockPlacement(slc).is_slice_like) + + assert_not_slice_like(slice(0, 0)) + assert_not_slice_like(slice(100, 0)) + + assert_not_slice_like(slice(100, 100, -1)) + assert_not_slice_like(slice(0, 100, -1)) + + self.assertTrue(not BlockPlacement(slice(0, 0)).is_slice_like) + self.assertTrue(not BlockPlacement(slice(100, 100)).is_slice_like) + + def test_array_to_slice_conversion(self): + def assert_as_slice_equals(arr, slc): + self.assertEquals(BlockPlacement(arr).as_slice, slc) + + assert_as_slice_equals([0], slice(0, 1, 1)) + assert_as_slice_equals([100], slice(100, 101, 1)) + + assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) + assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) + assert_as_slice_equals([0, 100], slice(0, 200, 100)) + + assert_as_slice_equals([2, 1], slice(2, 0, -1)) + assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) + assert_as_slice_equals([100, 0], slice(100, None, -100)) + + def test_not_slice_like_arrays(self): + def assert_not_slice_like(arr): + self.assertTrue(not BlockPlacement(arr).is_slice_like) + + assert_not_slice_like([]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, -2, -3]) + assert_not_slice_like([-10]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, 0, 1, 2]) + assert_not_slice_like([-2, 0, 2, 4]) + assert_not_slice_like([1, 0, -1]) + assert_not_slice_like([1, 1, 1]) + + def test_slice_iter(self): + self.assertEquals(list(BlockPlacement(slice(0, 3))), [0, 1, 2]) + self.assertEquals(list(BlockPlacement(slice(0, 0))), []) + self.assertEquals(list(BlockPlacement(slice(3, 0))), []) + + self.assertEquals(list(BlockPlacement(slice(3, 0, -1))), [3, 2, 1]) + self.assertEquals(list(BlockPlacement(slice(3, None, -1))), + [3, 2, 1, 0]) + + def test_slice_to_array_conversion(self): + def assert_as_array_equals(slc, asarray): + np.testing.assert_array_equal( + BlockPlacement(slc).as_array, + np.asarray(asarray)) + + assert_as_array_equals(slice(0, 3), [0, 1, 2]) + assert_as_array_equals(slice(0, 0), []) + assert_as_array_equals(slice(3, 0), []) + + assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) + assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) + + def test_blockplacement_add(self): + bpl = BlockPlacement(slice(0, 5)) + self.assertEquals(bpl.add(1).as_slice, slice(1, 6, 1)) + self.assertEquals(bpl.add(np.arange(5)).as_slice, + slice(0, 10, 2)) + self.assertEquals(list(bpl.add(np.arange(5, 0, -1))), + [5, 5, 5, 5, 5]) + + def test_blockplacement_add_int(self): + def assert_add_equals(val, inc, result): + self.assertEquals(list(BlockPlacement(val).add(inc)), + result) + + assert_add_equals(slice(0, 0), 0, []) + assert_add_equals(slice(1, 4), 0, [1, 2, 3]) + assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) + assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) + assert_add_equals([1, 2, 4], 0, [1, 2, 4]) + + assert_add_equals(slice(0, 0), 10, []) + assert_add_equals(slice(1, 4), 10, [11, 12, 13]) + assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) + assert_add_equals(slice(2, None, -1), 10, 
+        assert_add_equals([1, 2, 4], 10, [11, 12, 14])
+
+        assert_add_equals(slice(0, 0), -1, [])
+        assert_add_equals(slice(1, 4), -1, [0, 1, 2])
+        assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0])
+        assert_add_equals([1, 2, 4], -1, [0, 1, 3])
+
+        self.assertRaises(ValueError,
+                          lambda: BlockPlacement(slice(1, 4)).add(-10))
+        self.assertRaises(ValueError,
+                          lambda: BlockPlacement([1, 2, 4]).add(-10))
+        self.assertRaises(ValueError,
+                          lambda: BlockPlacement(slice(2, None, -1)).add(-1))
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/vb_suite/eval.py b/vb_suite/eval.py
index 3b0efa9e88f48..36aa702b5602a 100644
--- a/vb_suite/eval.py
+++ b/vb_suite/eval.py
@@ -55,7 +55,7 @@
                      start_date=datetime(2013, 7, 26))
 
 eval_frame_mult_python = \
-    Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')",
+    Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')",
               common_setup, name='eval_frame_mult_python',
               start_date=datetime(2013, 7, 21))
 
@@ -102,7 +102,7 @@
               name='eval_frame_chained_cmp_one_thread',
               start_date=datetime(2013, 7, 26))
 
-setup = common_setup
+# setup = common_setup
 eval_frame_chained_cmp_python = \
     Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')",
               common_setup, name='eval_frame_chained_cmp_python',
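
Note on the TestIndexing tests above: every generator follows the same oracle
pattern, that is, materialize the manager with as_matrix(), apply the
equivalent numpy operation, and compare the two results. The one non-obvious
step is the `(slice(None),) * axis + (slobj,)` tuple built in assert_slice_ok
to apply a 1-d slicer along an arbitrary axis. The snippet below is a
self-contained, numpy-only illustration of that idiom; slice_along_axis is a
hypothetical helper written for this note, not part of the patch.

    import numpy as np

    def slice_along_axis(mat, axis, slobj):
        # Pad the leading axes with ":" selectors so that `slobj` lands on
        # `axis`, the same trick assert_slice_ok uses to build mat_slobj.
        return mat[(slice(None),) * axis + (slobj,)]

    mat = np.arange(24).reshape(2, 3, 4)
    assert (slice_along_axis(mat, 1, slice(0, 2)) == mat[:, 0:2, :]).all()
    assert (slice_along_axis(mat, 2, slice(None, None, -1))
            == mat[..., ::-1]).all()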
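
Note on BlockPlacement: the TestBlockPlacement cases above pin down a small
contract. A placement wraps either a bounded, non-zero-step slice or an array
of positions; the slice and array forms stay interchangeable; and add()
refuses shifts that would produce negative positions. The sketch below is a
minimal numpy-only model of that contract, written for this note under those
assumptions; ToyPlacement is illustrative only and is not the class the patch
adds.

    import numpy as np

    class ToyPlacement(object):
        """Toy model of BlockPlacement's slice/array duality."""

        def __init__(self, val):
            if isinstance(val, slice):
                start, stop, step = val.start, val.stop, val.step
                if step == 0:
                    raise ValueError("slice step cannot be zero")
                step = 1 if step is None else step
                # Endpoints whose meaning depends on the container length
                # are rejected; only stop=None with a negative step is
                # allowed (it means "walk down to position 0 inclusive").
                if (start is None or start < 0
                        or (stop is None and step > 0)
                        or (stop is not None and stop < 0)):
                    raise ValueError("unbounded slice")
                stop = -1 if stop is None else stop  # sentinel for arange
                self.as_array = np.arange(start, stop, step, dtype=np.int64)
            else:
                self.as_array = np.asarray(val, dtype=np.int64)

        def __len__(self):
            return len(self.as_array)

        def __iter__(self):
            return iter(self.as_array)

        @property
        def is_slice_like(self):
            # Non-empty, non-negative, with a constant non-zero step.
            arr = self.as_array
            if len(arr) == 0 or arr.min() < 0:
                return False
            if len(arr) == 1:
                return True
            steps = np.diff(arr)
            return steps[0] != 0 and (steps == steps[0]).all()

        def add(self, other):
            # Shift by a scalar or array; negative results are invalid
            # because they no longer name positions in a container.
            new = self.as_array + other
            if len(new) and new.min() < 0:
                raise ValueError("unbounded slice")
            return ToyPlacement(new)

    # Spot-checks mirroring the expectations encoded in the tests above:
    assert len(ToyPlacement(slice(1, 0, -1))) == 1
    assert list(ToyPlacement(slice(3, None, -1))) == [3, 2, 1, 0]
    assert ToyPlacement([0, 5, 10]).is_slice_like
    assert not ToyPlacement([1, 1, 1]).is_slice_like
    assert list(ToyPlacement(slice(1, 4)).add(10)) == [11, 12, 13]
    try:
        ToyPlacement([1, 2, 4]).add(-10)
    except ValueError:
        pass
    else:
        raise AssertionError("negative placements must be rejected")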