# Patch summary (free text ahead of the first "diff --git" header).
#
# pandas/core/internals/__init__.py
#   create_block_manager_from_arrays is now imported from internals.api
#   (next to make_block) and no longer from internals.managers.
#
# pandas/core/internals/api.py
#   Adds create_block_manager_from_arrays(arrays, names, axes, consolidate=True)
#   together with a names-aware _form_blocks.  Entries of axes[0] for which
#   names.get_indexer_for returns -1 are gathered in extra_locs and backed by
#   a single object-dtype block filled with NaN.
#
# pandas/core/internals/construction.py
#   arrays_to_mgr (typ == "block") now calls
#   create_block_manager_from_column_arrays(arrays, axes, consolidate=...).
#
# pandas/core/internals/managers.py
#   create_block_manager_from_arrays becomes
#   create_block_manager_from_column_arrays and loses its `names` parameter;
#   _form_blocks takes (arrays, consolidate) and places arrays[i] at position
#   i; the extra_locs / NaN block is removed.  _simple_blockify and
#   _multi_blockify are renamed to simple_blockify and multi_blockify, the
#   names api.py imports.
#
# NOTE(review): leading whitespace and blank context lines inside the hunks
# were laid out from the Python syntax and the hunk line counts; confirm
# against the target tree (or apply with whitespace tolerance) before use.

diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
index af1350f088b7a..e04abd11697c3 100644
--- a/pandas/core/internals/__init__.py
+++ b/pandas/core/internals/__init__.py
@@ -1,4 +1,7 @@
-from pandas.core.internals.api import make_block  # pseudo-public version
+from pandas.core.internals.api import (
+    create_block_manager_from_arrays,
+    make_block,
+)
 from pandas.core.internals.array_manager import (
     ArrayManager,
     SingleArrayManager,
@@ -18,7 +21,6 @@
 from pandas.core.internals.managers import (
     BlockManager,
     SingleBlockManager,
-    create_block_manager_from_arrays,
     create_block_manager_from_blocks,
 )
 
diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py
index 37e07af71213e..e909fbd541d6d 100644
--- a/pandas/core/internals/api.py
+++ b/pandas/core/internals/api.py
@@ -8,10 +8,16 @@
 """
 from __future__ import annotations
 
+from collections import defaultdict
+from typing import DefaultDict
+
 import numpy as np
 
 from pandas._libs.internals import BlockPlacement
-from pandas._typing import Dtype
+from pandas._typing import (
+    ArrayLike,
+    Dtype,
+)
 
 from pandas.core.dtypes.common import (
     is_datetime64tz_dtype,
@@ -20,14 +26,24 @@
 
 from pandas.core.arrays import DatetimeArray
 from pandas.core.construction import extract_array
+from pandas.core.indexes.api import Index
 from pandas.core.internals.blocks import (
     Block,
+    CategoricalBlock,
     DatetimeTZBlock,
+    ExtensionBlock,
     check_ndim,
     ensure_block_shape,
     extract_pandas_array,
     get_block_type,
     maybe_coerce_values,
+    new_block,
+)
+from pandas.core.internals.managers import (
+    BlockManager,
+    construction_error,
+    multi_blockify,
+    simple_blockify,
 )
 
 
@@ -86,3 +102,110 @@ def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int
         else:
             ndim = values.ndim
     return ndim
+
+
+def create_block_manager_from_arrays(
+    arrays,
+    names: Index,
+    axes: list[Index],
+    consolidate: bool = True,
+) -> BlockManager:
+    # Assertions disabled for performance
+    # assert isinstance(names, Index)
+    # assert isinstance(axes, list)
+    # assert all(isinstance(x, Index) for x in axes)
+
+    arrays = [extract_array(x, extract_numpy=True) for x in arrays]
+
+    try:
+        blocks = _form_blocks(arrays, names, axes, consolidate)
+        mgr = BlockManager(blocks, axes)
+    except ValueError as e:
+        raise construction_error(len(arrays), arrays[0].shape, axes, e)
+    if consolidate:
+        mgr._consolidate_inplace()
+    return mgr
+
+
+def _form_blocks(
+    arrays: list[ArrayLike], names: Index, axes: list[Index], consolidate: bool
+) -> list[Block]:
+    # put "leftover" items in float bucket, where else?
+    # generalize?
+    items_dict: DefaultDict[str, list] = defaultdict(list)
+    extra_locs = []
+
+    names_idx = names
+    if names_idx.equals(axes[0]):
+        names_indexer = np.arange(len(names_idx))
+    else:
+        # Assertion disabled for performance
+        # assert names_idx.intersection(axes[0]).is_unique
+        names_indexer = names_idx.get_indexer_for(axes[0])
+
+    for i, name_idx in enumerate(names_indexer):
+        if name_idx == -1:
+            extra_locs.append(i)
+            continue
+
+        v = arrays[name_idx]
+
+        block_type = get_block_type(v)
+        items_dict[block_type.__name__].append((i, v))
+
+    blocks: list[Block] = []
+    if len(items_dict["NumericBlock"]):
+        numeric_blocks = multi_blockify(
+            items_dict["NumericBlock"], consolidate=consolidate
+        )
+        blocks.extend(numeric_blocks)
+
+    if len(items_dict["DatetimeLikeBlock"]):
+        dtlike_blocks = multi_blockify(
+            items_dict["DatetimeLikeBlock"], consolidate=consolidate
+        )
+        blocks.extend(dtlike_blocks)
+
+    if len(items_dict["DatetimeTZBlock"]):
+        dttz_blocks = [
+            DatetimeTZBlock(
+                ensure_block_shape(extract_array(array), 2),
+                placement=BlockPlacement(i),
+                ndim=2,
+            )
+            for i, array in items_dict["DatetimeTZBlock"]
+        ]
+        blocks.extend(dttz_blocks)
+
+    if len(items_dict["ObjectBlock"]) > 0:
+        object_blocks = simple_blockify(
+            items_dict["ObjectBlock"], np.object_, consolidate=consolidate
+        )
+        blocks.extend(object_blocks)
+
+    if len(items_dict["CategoricalBlock"]) > 0:
+        cat_blocks = [
+            CategoricalBlock(array, placement=BlockPlacement(i), ndim=2)
+            for i, array in items_dict["CategoricalBlock"]
+        ]
+        blocks.extend(cat_blocks)
+
+    if len(items_dict["ExtensionBlock"]):
+        external_blocks = [
+            ExtensionBlock(array, placement=BlockPlacement(i), ndim=2)
+            for i, array in items_dict["ExtensionBlock"]
+        ]
+
+        blocks.extend(external_blocks)
+
+    if len(extra_locs):
+        shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
+
+        # empty items -> dtype object
+        block_values = np.empty(shape, dtype=object)
+        block_values.fill(np.nan)
+
+        na_block = new_block(block_values, placement=extra_locs, ndim=2)
+        blocks.append(na_block)
+
+    return blocks
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index a859245b5a9fa..b7aa001f2ccee 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -85,8 +85,8 @@
 from pandas.core.internals.managers import (
     BlockManager,
     SingleBlockManager,
-    create_block_manager_from_arrays,
     create_block_manager_from_blocks,
+    create_block_manager_from_column_arrays,
 )
 
 if TYPE_CHECKING:
@@ -131,8 +131,8 @@ def arrays_to_mgr(
     axes = [columns, index]
 
     if typ == "block":
-        return create_block_manager_from_arrays(
-            arrays, columns, axes, consolidate=consolidate
+        return create_block_manager_from_column_arrays(
+            arrays, axes, consolidate=consolidate
         )
     elif typ == "array":
         if len(columns) != len(arrays):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index cffa134fd766e..7b2638fd9cd24 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1791,21 +1791,19 @@ def create_block_manager_from_blocks(
     return mgr
 
 
-def create_block_manager_from_arrays(
+def create_block_manager_from_column_arrays(
     arrays,
-    names: Index,
     axes: list[Index],
     consolidate: bool = True,
 ) -> BlockManager:
     # Assertions disabled for performance
-    # assert isinstance(names, Index)
     # assert isinstance(axes, list)
     # assert all(isinstance(x, Index) for x in axes)
 
     arrays = [extract_array(x, extract_numpy=True) for x in arrays]
 
     try:
-        blocks = _form_blocks(arrays, names, axes, consolidate)
+        blocks = _form_blocks(arrays, consolidate)
         mgr = BlockManager(blocks, axes)
     except ValueError as e:
         raise construction_error(len(arrays), arrays[0].shape, axes, e)
@@ -1843,26 +1841,11 @@ def construction_error(
 # -----------------------------------------------------------------------
 
 
-def _form_blocks(
-    arrays: list[ArrayLike], names: Index, axes: list[Index], consolidate: bool
-) -> list[Block]:
-    # put "leftover" items in float bucket, where else?
-    # generalize?
-    items_dict: DefaultDict[str, list] = defaultdict(list)
-    extra_locs = []
+def _form_blocks(arrays: list[ArrayLike], consolidate: bool) -> list[Block]:
 
-    names_idx = names
-    if names_idx.equals(axes[0]):
-        names_indexer = np.arange(len(names_idx))
-    else:
-        # Assertion disabled for performance
-        # assert names_idx.intersection(axes[0]).is_unique
-        names_indexer = names_idx.get_indexer_for(axes[0])
+    items_dict: DefaultDict[str, list] = defaultdict(list)
 
-    for i, name_idx in enumerate(names_indexer):
-        if name_idx == -1:
-            extra_locs.append(i)
-            continue
+    for i, name_idx in enumerate(range(len(arrays))):
 
         v = arrays[name_idx]
 
@@ -1871,13 +1854,13 @@ def _form_blocks(
 
     blocks: list[Block] = []
     if len(items_dict["NumericBlock"]):
-        numeric_blocks = _multi_blockify(
+        numeric_blocks = multi_blockify(
             items_dict["NumericBlock"], consolidate=consolidate
         )
         blocks.extend(numeric_blocks)
 
     if len(items_dict["DatetimeLikeBlock"]):
-        dtlike_blocks = _multi_blockify(
+        dtlike_blocks = multi_blockify(
             items_dict["DatetimeLikeBlock"], consolidate=consolidate
         )
         blocks.extend(dtlike_blocks)
@@ -1894,7 +1877,7 @@ def _form_blocks(
         blocks.extend(dttz_blocks)
 
     if len(items_dict["ObjectBlock"]) > 0:
-        object_blocks = _simple_blockify(
+        object_blocks = simple_blockify(
             items_dict["ObjectBlock"], np.object_, consolidate=consolidate
         )
         blocks.extend(object_blocks)
@@ -1914,20 +1897,10 @@ def _form_blocks(
 
         blocks.extend(external_blocks)
 
-    if len(extra_locs):
-        shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
-
-        # empty items -> dtype object
-        block_values = np.empty(shape, dtype=object)
-        block_values.fill(np.nan)
-
-        na_block = new_block(block_values, placement=extra_locs, ndim=2)
-        blocks.append(na_block)
-
     return blocks
 
 
-def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
+def simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
     """
     return a single array of a block that has a single dtype; if dtype is
     not None, coerce to this dtype
@@ -1945,7 +1918,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> list[Block]:
     return [block]
 
 
-def _multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = True):
+def multi_blockify(tuples, dtype: DtypeObj | None = None, consolidate: bool = True):
     """return an array of blocks that potentially have different dtypes"""
 
     if not consolidate: