Skip to content

Commit a51835b

Browse files
POC: ArrayManager -- array-based data manager for columnar store
1 parent 497ede8 commit a51835b

File tree

5 files changed

+590
-9
lines changed

5 files changed

+590
-9
lines changed

pandas/core/frame.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@
128128
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
129129
from pandas.core.indexes.period import PeriodIndex
130130
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
131-
from pandas.core.internals import BlockManager
131+
from pandas.core.internals import ArrayManager, BlockManager
132132
from pandas.core.internals.construction import (
133133
arrays_to_mgr,
134134
dataclasses_to_dicts,
@@ -446,6 +446,7 @@ def __init__(
446446
columns: Optional[Axes] = None,
447447
dtype: Optional[Dtype] = None,
448448
copy: bool = False,
449+
manager: str = "array",
449450
):
450451
if data is None:
451452
data = {}
@@ -455,7 +456,7 @@ def __init__(
455456
if isinstance(data, DataFrame):
456457
data = data._mgr
457458

458-
if isinstance(data, BlockManager):
459+
if isinstance(data, (BlockManager, ArrayManager)):
459460
if index is None and columns is None and dtype is None and copy is False:
460461
# GH#33357 fastpath
461462
NDFrame.__init__(
@@ -564,6 +565,11 @@ def __init__(
564565
values, index, columns, dtype=values.dtype, copy=False
565566
)
566567

568+
if manager == "array" and not isinstance(mgr, ArrayManager):
569+
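# TODO: proper initialization -- build the ArrayManager directly instead of round-tripping through a block-backed DataFrame and copying each column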
570+
df = DataFrame(mgr, manager="block")
571+
arrays = [arr.copy() for arr in df._iter_column_arrays()]
572+
mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
567573
NDFrame.__init__(self, mgr)
568574

569575
# ----------------------------------------------------------------------
@@ -638,6 +644,8 @@ def _is_homogeneous_type(self) -> bool:
638644
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
639645
False
640646
"""
647+
if isinstance(self._mgr, ArrayManager):
648+
return False
641649
if self._mgr.any_extension_types:
642650
return len({block.dtype for block in self._mgr.blocks}) == 1
643651
else:

pandas/core/generic.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
from pandas.core.indexes.datetimes import DatetimeIndex
101101
from pandas.core.indexes.period import Period, PeriodIndex
102102
import pandas.core.indexing as indexing
103-
from pandas.core.internals import BlockManager
103+
from pandas.core.internals import ArrayManager, BlockManager
104104
from pandas.core.missing import find_valid_index
105105
from pandas.core.ops import _align_method_FRAME
106106
from pandas.core.shared_docs import _shared_docs
@@ -197,7 +197,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
197197
_deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"])
198198
_metadata: List[str] = []
199199
_is_copy = None
200-
_mgr: BlockManager
200+
_mgr: Union[BlockManager, ArrayManager]
201201
_attrs: Dict[Optional[Hashable], Any]
202202
_typ: str
203203

@@ -206,7 +206,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
206206

207207
def __init__(
208208
self,
209-
data: BlockManager,
209+
data: Union[BlockManager, ArrayManager],
210210
copy: bool = False,
211211
attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
212212
):
@@ -223,7 +223,9 @@ def __init__(
223223
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
224224

225225
@classmethod
226-
def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
226+
def _init_mgr(
227+
cls, mgr, axes, dtype=None, copy: bool = False
228+
) -> Union[BlockManager, ArrayManager]:
227229
""" passed a manager and a axes dict """
228230
for a, axe in axes.items():
229231
if axe is not None:
@@ -5372,6 +5374,8 @@ def _protect_consolidate(self, f):
53725374
Consolidate _mgr -- if the blocks have changed, then clear the
53735375
cache
53745376
"""
5377+
if isinstance(self._mgr, ArrayManager):
5378+
return f()
53755379
blocks_before = len(self._mgr.blocks)
53765380
result = f()
53775381
if len(self._mgr.blocks) != blocks_before:

pandas/core/internals/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pandas.core.internals.concat import concatenate_block_managers
1717
from pandas.core.internals.managers import (
1818
BlockManager,
19+
ArrayManager,
1920
SingleBlockManager,
2021
create_block_manager_from_arrays,
2122
create_block_manager_from_blocks,

pandas/core/internals/concat.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from collections import defaultdict
22
import copy
3+
import itertools
34
from typing import Dict, List
45

56
import numpy as np
@@ -26,7 +27,7 @@
2627
import pandas.core.algorithms as algos
2728
from pandas.core.arrays import ExtensionArray
2829
from pandas.core.internals.blocks import make_block
29-
from pandas.core.internals.managers import BlockManager
30+
from pandas.core.internals.managers import ArrayManager, BlockManager
3031

3132

3233
def concatenate_block_managers(
@@ -46,6 +47,23 @@ def concatenate_block_managers(
4647
-------
4748
BlockManager
4849
"""
50+
# breakpoint()
51+
52+
if isinstance(mgrs_indexers[0][0], ArrayManager):
53+
54+
if concat_axis == 1:
55+
# TODO: for now only the fastpath (no indexers) is handled
56+
mgrs = [t[0] for t in mgrs_indexers]
57+
arrays = [
58+
np.concatenate([mgrs[i].arrays[j] for i in range(len(mgrs))])
59+
for j in range(len(mgrs[0].arrays))
60+
]
61+
return ArrayManager(arrays, [axes[1], axes[0]])
62+
elif concat_axis == 0:
63+
mgrs = [t[0] for t in mgrs_indexers]
64+
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
65+
return ArrayManager(arrays, [axes[1], axes[0]])
66+
4967
concat_plans = [
5068
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
5169
]

0 commit comments

Comments
 (0)