Skip to content

Commit 897a53c

Browse files
committed
Merge pull request pandas-dev#6745 from immerrr/refactor-blockmanager
CLN: revisit & simplify Block/BlockManager, remove axes
2 parents 76055a3 + f51235a commit 897a53c

21 files changed: +2844 / -2586 lines changed

pandas/core/format.py

+3 -5
Original file line number | Diff line number | Diff line change
@@ -1024,9 +1024,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
10241024

10251025
# preallocate data 2d list
10261026
self.blocks = self.obj._data.blocks
1027-
ncols = sum(len(b.items) for b in self.blocks)
1027+
ncols = sum(b.shape[0] for b in self.blocks)
10281028
self.data = [None] * ncols
1029-
self.column_map = self.obj._data.get_items_map(use_cached=False)
10301029

10311030
if chunksize is None:
10321031
chunksize = (100000 / (len(self.cols) or 1)) or 1
@@ -1293,10 +1292,9 @@ def _save_chunk(self, start_i, end_i):
12931292
float_format=self.float_format,
12941293
date_format=self.date_format)
12951294

1296-
for i, item in enumerate(b.items):
1297-
1295+
for col_loc, col in zip(b.mgr_locs, d):
12981296
# self.data is a preallocated list
1299-
self.data[self.column_map[b][i]] = d[i]
1297+
self.data[col_loc] = col
13001298

13011299
ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
13021300
float_format=self.float_format,

pandas/core/frame.py

+24 -21
Original file line number | Diff line number | Diff line change
@@ -1043,9 +1043,11 @@ def to_panel(self):
10431043

10441044
new_blocks = []
10451045
for block in selfsorted._data.blocks:
1046-
newb = block2d_to_blocknd(block.values.T, block.items, shape,
1047-
[major_labels, minor_labels],
1048-
ref_items=selfsorted.columns)
1046+
newb = block2d_to_blocknd(
1047+
values=block.values.T,
1048+
placement=block.mgr_locs, shape=shape,
1049+
labels=[major_labels, minor_labels],
1050+
ref_items=selfsorted.columns)
10491051
new_blocks.append(newb)
10501052

10511053
# preserve names, if any
@@ -1934,7 +1936,9 @@ def _ensure_valid_index(self, value):
19341936
raise ValueError('Cannot set a frame with no defined index '
19351937
'and a value that cannot be converted to a '
19361938
'Series')
1937-
self._data.set_axis(1, value.index.copy(), check_axis=False)
1939+
1940+
self._data = self._data.reindex_axis(value.index.copy(), axis=1,
1941+
fill_value=np.nan)
19381942

19391943
# we are a scalar
19401944
# noop
@@ -2039,7 +2043,11 @@ def _sanitize_column(self, key, value):
20392043

20402044
@property
20412045
def _series(self):
2042-
return self._data.get_series_dict()
2046+
result = {}
2047+
for idx, item in enumerate(self.columns):
2048+
result[item] = Series(self._data.iget(idx), index=self.index,
2049+
name=item)
2050+
return result
20432051

20442052
def lookup(self, row_labels, col_labels):
20452053
"""Label-based "fancy indexing" function for DataFrame.
@@ -2629,16 +2637,14 @@ def trans(v):
26292637
indexer = _nargsort(labels, kind=kind, ascending=ascending,
26302638
na_position=na_position)
26312639

2640+
bm_axis = self._get_block_manager_axis(axis)
2641+
new_data = self._data.take(indexer, axis=bm_axis,
2642+
convert=False, verify=False)
2643+
26322644
if inplace:
2633-
if axis == 1:
2634-
new_data = self._data.reindex_items(
2635-
self._data.items[indexer],
2636-
copy=False)
2637-
elif axis == 0:
2638-
new_data = self._data.take(indexer)
2639-
self._update_inplace(new_data)
2645+
return self._update_inplace(new_data)
26402646
else:
2641-
return self.take(indexer, axis=axis, convert=False, is_copy=False)
2647+
return self._constructor(new_data).__finalize__(self)
26422648

26432649
def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
26442650
"""
@@ -2673,16 +2679,13 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False):
26732679
else:
26742680
return self.take(indexer, axis=axis, convert=False)
26752681

2682+
bm_axis = self._get_block_manager_axis(axis)
2683+
new_data = self._data.take(indexer, axis=bm_axis,
2684+
convert=False, verify=False)
26762685
if inplace:
2677-
if axis == 1:
2678-
new_data = self._data.reindex_items(
2679-
self._data.items[indexer],
2680-
copy=False)
2681-
elif axis == 0:
2682-
new_data = self._data.take(indexer)
2683-
self._update_inplace(new_data)
2686+
return self._update_inplace(new_data)
26842687
else:
2685-
return self.take(indexer, axis=axis, convert=False, is_copy=False)
2688+
return self._constructor(new_data).__finalize__(self)
26862689

26872690
def swaplevel(self, i, j, axis=0):
26882691
"""

pandas/core/generic.py

+13 -39
Original file line number | Diff line number | Diff line change
@@ -565,7 +565,7 @@ def f(x):
565565
f = _get_rename_function(v)
566566

567567
baxis = self._get_block_manager_axis(axis)
568-
result._data = result._data.rename(f, axis=baxis, copy=copy)
568+
result._data = result._data.rename_axis(f, axis=baxis, copy=copy)
569569
result._clear_item_cache()
570570

571571
if inplace:
@@ -1217,21 +1217,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True):
12171217
taken : type of caller
12181218
"""
12191219

1220-
# check/convert indicies here
1221-
if convert:
1222-
axis = self._get_axis_number(axis)
1223-
indices = _maybe_convert_indices(
1224-
indices, len(self._get_axis(axis)))
1225-
1226-
baxis = self._get_block_manager_axis(axis)
1227-
if baxis == 0:
1228-
labels = self._get_axis(axis)
1229-
new_items = labels.take(indices)
1230-
new_data = self._data.reindex_axis(new_items, indexer=indices,
1231-
axis=baxis)
1232-
else:
1233-
new_data = self._data.take(indices, axis=baxis)
1234-
1220+
new_data = self._data.take(indices,
1221+
axis=self._get_block_manager_axis(axis),
1222+
convert=True, verify=True)
12351223
result = self._constructor(new_data).__finalize__(self)
12361224

12371225
# maybe set copy if we didn't actually change the index
@@ -1701,7 +1689,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
17011689
labels, method, level, limit=limit, copy_if_needed=True)
17021690
return self._reindex_with_indexers(
17031691
{axis: [new_index, indexer]}, method=method, fill_value=fill_value,
1704-
limit=limit, copy=copy).__finalize__(self)
1692+
limit=limit, copy=copy)
17051693

17061694
def _reindex_with_indexers(self, reindexers, method=None,
17071695
fill_value=np.nan, limit=None, copy=False,
@@ -1716,30 +1704,16 @@ def _reindex_with_indexers(self, reindexers, method=None,
17161704

17171705
if index is None:
17181706
continue
1719-
index = _ensure_index(index)
17201707

1721-
# reindex the axis
1722-
if method is not None:
1723-
new_data = new_data.reindex_axis(
1724-
index, indexer=indexer, method=method, axis=baxis,
1725-
fill_value=fill_value, limit=limit, copy=copy)
1726-
1727-
elif indexer is not None:
1728-
# TODO: speed up on homogeneous DataFrame objects
1708+
index = _ensure_index(index)
1709+
if indexer is not None:
17291710
indexer = com._ensure_int64(indexer)
1730-
new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
1731-
fill_value=fill_value,
1732-
allow_dups=allow_dups)
1733-
1734-
elif (baxis == 0 and index is not None and
1735-
index is not new_data.axes[baxis]):
1736-
new_data = new_data.reindex_items(index, copy=copy,
1737-
fill_value=fill_value)
1738-
1739-
elif (baxis > 0 and index is not None and
1740-
index is not new_data.axes[baxis]):
1741-
new_data = new_data.copy(deep=copy)
1742-
new_data.set_axis(baxis, index)
1711+
1712+
# TODO: speed up on homogeneous DataFrame objects
1713+
new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
1714+
fill_value=fill_value,
1715+
allow_dups=allow_dups,
1716+
copy=copy)
17431717

17441718
if copy and new_data is self._data:
17451719
new_data = new_data.copy()

pandas/core/groupby.py

+15 -29
Original file line number | Diff line number | Diff line change
@@ -2196,10 +2196,10 @@ def _iterate_slices(self):
21962196
yield val, slicer(val)
21972197

21982198
def _cython_agg_general(self, how, numeric_only=True):
2199-
new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
2200-
return self._wrap_agged_blocks(new_blocks)
2199+
new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only)
2200+
return self._wrap_agged_blocks(new_items, new_blocks)
22012201

2202-
def _wrap_agged_blocks(self, blocks):
2202+
def _wrap_agged_blocks(self, items, blocks):
22032203
obj = self._obj_with_exclusions
22042204

22052205
new_axes = list(obj._data.axes)
@@ -2210,6 +2210,10 @@ def _wrap_agged_blocks(self, blocks):
22102210
else:
22112211
new_axes[self.axis] = self.grouper.result_index
22122212

2213+
# Make sure block manager integrity check passes.
2214+
assert new_axes[0].equals(items)
2215+
new_axes[0] = items
2216+
22132217
mgr = BlockManager(blocks, new_axes)
22142218

22152219
new_obj = type(obj)(mgr)
@@ -2223,14 +2227,14 @@ def _cython_agg_blocks(self, how, numeric_only=True):
22232227

22242228
new_blocks = []
22252229

2230+
if numeric_only:
2231+
data = data.get_numeric_data(copy=False)
2232+
22262233
for block in data.blocks:
22272234
values = block.values
22282235

22292236
is_numeric = is_numeric_dtype(values.dtype)
22302237

2231-
if numeric_only and not is_numeric:
2232-
continue
2233-
22342238
if is_numeric:
22352239
values = com.ensure_float(values)
22362240

@@ -2239,13 +2243,13 @@ def _cython_agg_blocks(self, how, numeric_only=True):
22392243
# see if we can cast the block back to the original dtype
22402244
result = block._try_cast_result(result)
22412245

2242-
newb = make_block(result, block.items, block.ref_items)
2246+
newb = make_block(result, placement=block.mgr_locs)
22432247
new_blocks.append(newb)
22442248

22452249
if len(new_blocks) == 0:
22462250
raise DataError('No numeric types to aggregate')
22472251

2248-
return new_blocks
2252+
return data.items, new_blocks
22492253

22502254
def _get_data_to_aggregate(self):
22512255
obj = self._obj_with_exclusions
@@ -2837,28 +2841,10 @@ def _wrap_aggregated_output(self, output, names=None):
28372841

28382842
return result.convert_objects()
28392843

2840-
def _wrap_agged_blocks(self, blocks):
2841-
obj = self._obj_with_exclusions
2842-
2843-
if self.axis == 0:
2844-
agg_labels = obj.columns
2845-
else:
2846-
agg_labels = obj.index
2847-
2848-
if sum(len(x.items) for x in blocks) == len(agg_labels):
2849-
output_keys = agg_labels
2850-
else:
2851-
all_items = []
2852-
for b in blocks:
2853-
all_items.extend(b.items)
2854-
output_keys = agg_labels[agg_labels.isin(all_items)]
2855-
2856-
for blk in blocks:
2857-
blk.set_ref_items(output_keys, maybe_rename=False)
2858-
2844+
def _wrap_agged_blocks(self, items, blocks):
28592845
if not self.as_index:
28602846
index = np.arange(blocks[0].values.shape[1])
2861-
mgr = BlockManager(blocks, [output_keys, index])
2847+
mgr = BlockManager(blocks, [items, index])
28622848
result = DataFrame(mgr)
28632849

28642850
group_levels = self.grouper.get_group_levels()
@@ -2869,7 +2855,7 @@ def _wrap_agged_blocks(self, blocks):
28692855
result = result.consolidate()
28702856
else:
28712857
index = self.grouper.result_index
2872-
mgr = BlockManager(blocks, [output_keys, index])
2858+
mgr = BlockManager(blocks, [items, index])
28732859
result = DataFrame(mgr)
28742860

28752861
if self.axis == 1:

0 commit comments

Comments
 (0)