Skip to content

Commit 618ac6d

Browse files
jorisvandenbosscheluckyvs1
authored andcommitted
POC: ArrayManager -- array-based data manager for columnar store (pandas-dev#36010)
1 parent 08d83e9 commit 618ac6d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+1389
-72
lines changed

.github/workflows/ci.yml

+19
Original file line numberDiff line numberDiff line change
@@ -132,3 +132,22 @@ jobs:
132132
- name: Upload dev docs
133133
run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev
134134
if: github.event_name == 'push'
135+
136+
data_manager:
137+
name: Test experimental data manager
138+
runs-on: ubuntu-latest
139+
steps:
140+
141+
- name: Setting conda path
142+
run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
143+
144+
- name: Checkout
145+
uses: actions/checkout@v1
146+
147+
- name: Setup environment and build pandas
148+
run: ci/setup_env.sh
149+
150+
- name: Run tests
151+
run: |
152+
source activate pandas-dev
153+
pytest pandas/tests/frame/methods --array-manager

pandas/_typing.py

+4
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from pandas.core.generic import NDFrame # noqa: F401
4040
from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy
4141
from pandas.core.indexes.base import Index
42+
from pandas.core.internals import ArrayManager, BlockManager
4243
from pandas.core.resample import Resampler
4344
from pandas.core.series import Series
4445
from pandas.core.window.rolling import BaseWindow
@@ -159,3 +160,6 @@
159160
ColspaceArgType = Union[
160161
str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]]
161162
]
163+
164+
# internals
165+
Manager = Union["ArrayManager", "BlockManager"]

pandas/conftest.py

+21
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,19 @@ def pytest_addoption(parser):
7575
action="store_true",
7676
help="Fail if a test is skipped for missing data file.",
7777
)
78+
parser.addoption(
79+
"--array-manager",
80+
"--am",
81+
action="store_true",
82+
help="Use the experimental ArrayManager as default data manager.",
83+
)
84+
85+
86+
def pytest_sessionstart(session):
87+
# Note: we need to set the option here and not in pytest_runtest_setup below
88+
# to ensure this is run before creating fixture data
89+
if session.config.getoption("--array-manager"):
90+
pd.options.mode.data_manager = "array"
7891

7992

8093
def pytest_runtest_setup(item):
@@ -1454,3 +1467,11 @@ def indexer_si(request):
14541467
Parametrize over __setitem__, iloc.__setitem__
14551468
"""
14561469
return request.param
1470+
1471+
1472+
@pytest.fixture
1473+
def using_array_manager(request):
1474+
"""
1475+
Fixture to check if the array manager is being used.
1476+
"""
1477+
return pd.options.mode.data_manager == "array"

pandas/core/config_init.py

+6
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,12 @@ def use_inf_as_na_cb(key):
483483
cf.register_option(
484484
"use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb
485485
)
486+
cf.register_option(
487+
"data_manager",
488+
"block",
489+
"Internal data manager type",
490+
validator=is_one_of_factory(["block", "array"]),
491+
)
486492

487493
cf.deprecate_option(
488494
"mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na"

pandas/core/frame.py

+38-6
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
IndexKeyFunc,
6363
IndexLabel,
6464
Level,
65+
Manager,
6566
PythonFuncType,
6667
Renamer,
6768
StorageOptions,
@@ -137,13 +138,14 @@
137138
)
138139
from pandas.core.indexes.multi import MultiIndex, maybe_droplevels
139140
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
140-
from pandas.core.internals import BlockManager
141+
from pandas.core.internals import ArrayManager, BlockManager
141142
from pandas.core.internals.construction import (
142143
arrays_to_mgr,
143144
dataclasses_to_dicts,
144145
init_dict,
145146
init_ndarray,
146147
masked_rec_array_to_mgr,
148+
mgr_to_mgr,
147149
nested_data_to_arrays,
148150
reorder_arrays,
149151
sanitize_index,
@@ -523,7 +525,7 @@ def __init__(
523525
if isinstance(data, DataFrame):
524526
data = data._mgr
525527

526-
if isinstance(data, BlockManager):
528+
if isinstance(data, (BlockManager, ArrayManager)):
527529
if index is None and columns is None and dtype is None and copy is False:
528530
# GH#33357 fastpath
529531
NDFrame.__init__(self, data)
@@ -601,8 +603,31 @@ def __init__(
601603
values, index, columns, dtype=values.dtype, copy=False
602604
)
603605

606+
# ensure correct Manager type according to settings
607+
manager = get_option("mode.data_manager")
608+
mgr = mgr_to_mgr(mgr, typ=manager)
609+
604610
NDFrame.__init__(self, mgr)
605611

612+
def _as_manager(self, typ: str) -> DataFrame:
613+
"""
614+
Private helper function to create a DataFrame with specific manager.
615+
616+
Parameters
617+
----------
618+
typ : {"block", "array"}
619+
620+
Returns
621+
-------
622+
DataFrame
623+
New DataFrame using specified manager type. Is not guaranteed
624+
to be a copy or not.
625+
"""
626+
new_mgr: Manager
627+
new_mgr = mgr_to_mgr(self._mgr, typ=typ)
628+
# fastpath of passing a manager doesn't check the option/manager class
629+
return DataFrame(new_mgr)
630+
606631
# ----------------------------------------------------------------------
607632

608633
@property
@@ -675,6 +700,8 @@ def _is_homogeneous_type(self) -> bool:
675700
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
676701
False
677702
"""
703+
if isinstance(self._mgr, ArrayManager):
704+
return len({arr.dtype for arr in self._mgr.arrays}) == 1
678705
if self._mgr.any_extension_types:
679706
return len({block.dtype for block in self._mgr.blocks}) == 1
680707
else:
@@ -685,6 +712,8 @@ def _can_fast_transpose(self) -> bool:
685712
"""
686713
Can we transpose this DataFrame without creating any new array objects.
687714
"""
715+
if isinstance(self._mgr, ArrayManager):
716+
return False
688717
if self._mgr.any_extension_types:
689718
# TODO(EA2D) special case would be unnecessary with 2D EAs
690719
return False
@@ -5506,7 +5535,7 @@ def sort_values( # type: ignore[override]
55065535
)
55075536

55085537
if ignore_index:
5509-
new_data.axes[1] = ibase.default_index(len(indexer))
5538+
new_data.set_axis(1, ibase.default_index(len(indexer)))
55105539

55115540
result = self._constructor(new_data)
55125541
if inplace:
@@ -6051,7 +6080,10 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None):
60516080
# fails in cases with empty columns reached via
60526081
# _frame_arith_method_with_reindex
60536082

6054-
bm = self._mgr.operate_blockwise(right._mgr, array_op)
6083+
# TODO operate_blockwise expects a manager of the same type
6084+
bm = self._mgr.operate_blockwise(
6085+
right._mgr, array_op # type: ignore[arg-type]
6086+
)
60556087
return type(self)(bm)
60566088

60576089
elif isinstance(right, Series) and axis == 1:
@@ -8894,11 +8926,11 @@ def func(values: np.ndarray):
88948926
# We only use this in the case that operates on self.values
88958927
return op(values, axis=axis, skipna=skipna, **kwds)
88968928

8897-
def blk_func(values):
8929+
def blk_func(values, axis=1):
88988930
if isinstance(values, ExtensionArray):
88998931
return values._reduce(name, skipna=skipna, **kwds)
89008932
else:
8901-
return op(values, axis=1, skipna=skipna, **kwds)
8933+
return op(values, axis=axis, skipna=skipna, **kwds)
89028934

89038935
def _get_data() -> DataFrame:
89048936
if filter_type is None:

pandas/core/generic.py

+20-9
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
IndexLabel,
4646
JSONSerializable,
4747
Level,
48+
Manager,
4849
NpDtype,
4950
Renamer,
5051
StorageOptions,
@@ -102,7 +103,7 @@
102103
RangeIndex,
103104
ensure_index,
104105
)
105-
from pandas.core.internals import BlockManager
106+
from pandas.core.internals import ArrayManager, BlockManager
106107
from pandas.core.missing import find_valid_index
107108
from pandas.core.ops import align_method_FRAME
108109
from pandas.core.shared_docs import _shared_docs
@@ -179,7 +180,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
179180
)
180181
_metadata: List[str] = []
181182
_is_copy = None
182-
_mgr: BlockManager
183+
_mgr: Manager
183184
_attrs: Dict[Optional[Hashable], Any]
184185
_typ: str
185186

@@ -188,7 +189,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
188189

189190
def __init__(
190191
self,
191-
data: BlockManager,
192+
data: Manager,
192193
copy: bool = False,
193194
attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
194195
):
@@ -207,7 +208,7 @@ def __init__(
207208
@classmethod
208209
def _init_mgr(
209210
cls, mgr, axes, dtype: Optional[Dtype] = None, copy: bool = False
210-
) -> BlockManager:
211+
) -> Manager:
211212
""" passed a manager and a axes dict """
212213
for a, axe in axes.items():
213214
if axe is not None:
@@ -220,7 +221,13 @@ def _init_mgr(
220221
mgr = mgr.copy()
221222
if dtype is not None:
222223
# avoid further copies if we can
223-
if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype:
224+
if (
225+
isinstance(mgr, BlockManager)
226+
and len(mgr.blocks) == 1
227+
and mgr.blocks[0].values.dtype == dtype
228+
):
229+
pass
230+
else:
224231
mgr = mgr.astype(dtype=dtype)
225232
return mgr
226233

@@ -4544,11 +4551,11 @@ def sort_index(
45444551
new_data = self._mgr.take(indexer, axis=baxis, verify=False)
45454552

45464553
# reconstruct axis if needed
4547-
new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
4554+
new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())
45484555

45494556
if ignore_index:
45504557
axis = 1 if isinstance(self, ABCDataFrame) else 0
4551-
new_data.axes[axis] = ibase.default_index(len(indexer))
4558+
new_data.set_axis(axis, ibase.default_index(len(indexer)))
45524559

45534560
result = self._constructor(new_data)
45544561

@@ -5521,6 +5528,8 @@ def _protect_consolidate(self, f):
55215528
Consolidate _mgr -- if the blocks have changed, then clear the
55225529
cache
55235530
"""
5531+
if isinstance(self._mgr, ArrayManager):
5532+
return f()
55245533
blocks_before = len(self._mgr.blocks)
55255534
result = f()
55265535
if len(self._mgr.blocks) != blocks_before:
@@ -5710,11 +5719,13 @@ def _to_dict_of_blocks(self, copy: bool_t = True):
57105719
Return a dict of dtype -> Constructor Types that
57115720
each is a homogeneous dtype.
57125721
5713-
Internal ONLY
5722+
Internal ONLY - only works for BlockManager
57145723
"""
5724+
mgr = self._mgr
5725+
mgr = cast(BlockManager, mgr)
57155726
return {
57165727
k: self._constructor(v).__finalize__(self)
5717-
for k, v, in self._mgr.to_dict(copy=copy).items()
5728+
for k, v, in mgr.to_dict(copy=copy).items()
57185729
}
57195730

57205731
def astype(

pandas/core/groupby/generic.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1086,10 +1086,12 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
10861086
# in the operation. We un-split here.
10871087
result = result._consolidate()
10881088
assert isinstance(result, (Series, DataFrame)) # for mypy
1089-
assert len(result._mgr.blocks) == 1
1089+
mgr = result._mgr
1090+
assert isinstance(mgr, BlockManager)
1091+
assert len(mgr.blocks) == 1
10901092

10911093
# unwrap DataFrame to get array
1092-
result = result._mgr.blocks[0].values
1094+
result = mgr.blocks[0].values
10931095
return result
10941096

10951097
def blk_func(bvalues: ArrayLike) -> ArrayLike:

pandas/core/internals/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from pandas.core.internals.array_manager import ArrayManager
2+
from pandas.core.internals.base import DataManager
13
from pandas.core.internals.blocks import ( # io.pytables, io.packers
24
Block,
35
BoolBlock,
@@ -35,6 +37,8 @@
3537
"TimeDeltaBlock",
3638
"safe_reshape",
3739
"make_block",
40+
"DataManager",
41+
"ArrayManager",
3842
"BlockManager",
3943
"SingleBlockManager",
4044
"concatenate_block_managers",

0 commit comments

Comments
 (0)