Skip to content

CLN: revisit & simplify Block/BlockManager, remove axes #6745

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,9 +1024,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,

# preallocate data 2d list
self.blocks = self.obj._data.blocks
ncols = sum(len(b.items) for b in self.blocks)
ncols = sum(b.shape[0] for b in self.blocks)
self.data = [None] * ncols
self.column_map = self.obj._data.get_items_map(use_cached=False)

if chunksize is None:
chunksize = (100000 / (len(self.cols) or 1)) or 1
Expand Down Expand Up @@ -1293,10 +1292,9 @@ def _save_chunk(self, start_i, end_i):
float_format=self.float_format,
date_format=self.date_format)

for i, item in enumerate(b.items):

for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[self.column_map[b][i]] = d[i]
self.data[col_loc] = col

ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
float_format=self.float_format,
Expand Down
45 changes: 24 additions & 21 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,9 +1043,11 @@ def to_panel(self):

new_blocks = []
for block in selfsorted._data.blocks:
newb = block2d_to_blocknd(block.values.T, block.items, shape,
[major_labels, minor_labels],
ref_items=selfsorted.columns)
newb = block2d_to_blocknd(
values=block.values.T,
placement=block.mgr_locs, shape=shape,
labels=[major_labels, minor_labels],
ref_items=selfsorted.columns)
new_blocks.append(newb)

# preserve names, if any
Expand Down Expand Up @@ -1934,7 +1936,9 @@ def _ensure_valid_index(self, value):
raise ValueError('Cannot set a frame with no defined index '
'and a value that cannot be converted to a '
'Series')
self._data.set_axis(1, value.index.copy(), check_axis=False)

self._data = self._data.reindex_axis(value.index.copy(), axis=1,
fill_value=np.nan)

# we are a scalar
# noop
Expand Down Expand Up @@ -2039,7 +2043,11 @@ def _sanitize_column(self, key, value):

@property
def _series(self):
return self._data.get_series_dict()
result = {}
for idx, item in enumerate(self.columns):
result[item] = Series(self._data.iget(idx), index=self.index,
name=item)
return result

def lookup(self, row_labels, col_labels):
"""Label-based "fancy indexing" function for DataFrame.
Expand Down Expand Up @@ -2629,16 +2637,14 @@ def trans(v):
indexer = _nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

bm_axis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer, axis=bm_axis,
convert=False, verify=False)

if inplace:
if axis == 1:
new_data = self._data.reindex_items(
self._data.items[indexer],
copy=False)
elif axis == 0:
new_data = self._data.take(indexer)
self._update_inplace(new_data)
return self._update_inplace(new_data)
else:
return self.take(indexer, axis=axis, convert=False, is_copy=False)
return self._constructor(new_data).__finalize__(self)

def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
"""
Expand Down Expand Up @@ -2673,16 +2679,13 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
else:
return self.take(indexer, axis=axis, convert=False)

bm_axis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer, axis=bm_axis,
convert=False, verify=False)
if inplace:
if axis == 1:
new_data = self._data.reindex_items(
self._data.items[indexer],
copy=False)
elif axis == 0:
new_data = self._data.take(indexer)
self._update_inplace(new_data)
return self._update_inplace(new_data)
else:
return self.take(indexer, axis=axis, convert=False, is_copy=False)
return self._constructor(new_data).__finalize__(self)

def swaplevel(self, i, j, axis=0):
"""
Expand Down
52 changes: 13 additions & 39 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def f(x):
f = _get_rename_function(v)

baxis = self._get_block_manager_axis(axis)
result._data = result._data.rename(f, axis=baxis, copy=copy)
result._data = result._data.rename_axis(f, axis=baxis, copy=copy)
result._clear_item_cache()

if inplace:
Expand Down Expand Up @@ -1217,21 +1217,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
taken : type of caller
"""

# check/convert indicies here
if convert:
axis = self._get_axis_number(axis)
indices = _maybe_convert_indices(
indices, len(self._get_axis(axis)))

baxis = self._get_block_manager_axis(axis)
if baxis == 0:
labels = self._get_axis(axis)
new_items = labels.take(indices)
new_data = self._data.reindex_axis(new_items, indexer=indices,
axis=baxis)
else:
new_data = self._data.take(indices, axis=baxis)

new_data = self._data.take(indices,
axis=self._get_block_manager_axis(axis),
convert=True, verify=True)
result = self._constructor(new_data).__finalize__(self)

# maybe set copy if we didn't actually change the index
Expand Down Expand Up @@ -1701,7 +1689,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
labels, method, level, limit=limit, copy_if_needed=True)
return self._reindex_with_indexers(
{axis: [new_index, indexer]}, method=method, fill_value=fill_value,
limit=limit, copy=copy).__finalize__(self)
limit=limit, copy=copy)

def _reindex_with_indexers(self, reindexers, method=None,
fill_value=np.nan, limit=None, copy=False,
Expand All @@ -1716,30 +1704,16 @@ def _reindex_with_indexers(self, reindexers, method=None,

if index is None:
continue
index = _ensure_index(index)

# reindex the axis
if method is not None:
new_data = new_data.reindex_axis(
index, indexer=indexer, method=method, axis=baxis,
fill_value=fill_value, limit=limit, copy=copy)

elif indexer is not None:
# TODO: speed up on homogeneous DataFrame objects
index = _ensure_index(index)
if indexer is not None:
indexer = com._ensure_int64(indexer)
new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
fill_value=fill_value,
allow_dups=allow_dups)

elif (baxis == 0 and index is not None and
index is not new_data.axes[baxis]):
new_data = new_data.reindex_items(index, copy=copy,
fill_value=fill_value)

elif (baxis > 0 and index is not None and
index is not new_data.axes[baxis]):
new_data = new_data.copy(deep=copy)
new_data.set_axis(baxis, index)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is copy handled in BlockManager.reindex_indexer?

I don't think this is well tested, and most of the time it should probably copy unless the indexes are identical.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A rule of thumb I try to follow is, yeah, that reindex should copy, unless there's an "inplace=True" kwarg somewhere.

But point taken, need to double check that.

# TODO: speed up on homogeneous DataFrame objects
new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
fill_value=fill_value,
allow_dups=allow_dups,
copy=copy)

if copy and new_data is self._data:
new_data = new_data.copy()
Expand Down
44 changes: 15 additions & 29 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2196,10 +2196,10 @@ def _iterate_slices(self):
yield val, slicer(val)

def _cython_agg_general(self, how, numeric_only=True):
new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
return self._wrap_agged_blocks(new_blocks)
new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
return self._wrap_agged_blocks(new_items, new_blocks)

def _wrap_agged_blocks(self, blocks):
def _wrap_agged_blocks(self, items, blocks):
obj = self._obj_with_exclusions

new_axes = list(obj._data.axes)
Expand All @@ -2210,6 +2210,10 @@ def _wrap_agged_blocks(self, blocks):
else:
new_axes[self.axis] = self.grouper.result_index

# Make sure block manager integrity check passes.
assert new_axes[0].equals(items)
new_axes[0] = items

mgr = BlockManager(blocks, new_axes)

new_obj = type(obj)(mgr)
Expand All @@ -2223,14 +2227,14 @@ def _cython_agg_blocks(self, how, numeric_only=True):

new_blocks = []

if numeric_only:
data = data.get_numeric_data(copy=False)

for block in data.blocks:
values = block.values

is_numeric = is_numeric_dtype(values.dtype)

if numeric_only and not is_numeric:
continue

if is_numeric:
values = com.ensure_float(values)

Expand All @@ -2239,13 +2243,13 @@ def _cython_agg_blocks(self, how, numeric_only=True):
# see if we can cast the block back to the original dtype
result = block._try_cast_result(result)

newb = make_block(result, block.items, block.ref_items)
newb = make_block(result, placement=block.mgr_locs)
new_blocks.append(newb)

if len(new_blocks) == 0:
raise DataError('No numeric types to aggregate')

return new_blocks
return data.items, new_blocks

def _get_data_to_aggregate(self):
obj = self._obj_with_exclusions
Expand Down Expand Up @@ -2837,28 +2841,10 @@ def _wrap_aggregated_output(self, output, names=None):

return result.convert_objects()

def _wrap_agged_blocks(self, blocks):
obj = self._obj_with_exclusions

if self.axis == 0:
agg_labels = obj.columns
else:
agg_labels = obj.index

if sum(len(x.items) for x in blocks) == len(agg_labels):
output_keys = agg_labels
else:
all_items = []
for b in blocks:
all_items.extend(b.items)
output_keys = agg_labels[agg_labels.isin(all_items)]

for blk in blocks:
blk.set_ref_items(output_keys, maybe_rename=False)

def _wrap_agged_blocks(self, items, blocks):
if not self.as_index:
index = np.arange(blocks[0].values.shape[1])
mgr = BlockManager(blocks, [output_keys, index])
mgr = BlockManager(blocks, [items, index])
result = DataFrame(mgr)

group_levels = self.grouper.get_group_levels()
Expand All @@ -2869,7 +2855,7 @@ def _wrap_agged_blocks(self, blocks):
result = result.consolidate()
else:
index = self.grouper.result_index
mgr = BlockManager(blocks, [output_keys, index])
mgr = BlockManager(blocks, [items, index])
result = DataFrame(mgr)

if self.axis == 1:
Expand Down
Loading