diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca488190a8704..207e3e7635cac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -259,7 +259,7 @@ from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg - from pandas.core.internals import SingleDataManager + from pandas.core.internals.managers import SingleBlockManager from pandas.io.formats.style import Styler @@ -4530,14 +4530,14 @@ def _ensure_valid_index(self, value) -> None: self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) - def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: + def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series: """ Provide boxed values for a column. """ # Lookup in columns so that if e.g. a str datetime was passed # we attach the Timestamp object as the name. name = self.columns[loc] - # We get index=self.index bc values is a SingleDataManager + # We get index=self.index bc values is a SingleBlockManager obj = self._constructor_sliced_from_mgr(values, axes=values.axes) obj._name = name return obj.__finalize__(self) diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index da394e783be4b..fb14c5ad82f4f 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,8 +1,4 @@ from pandas.core.internals.api import make_block # 2023-09-18 pyarrow uses this -from pandas.core.internals.base import ( - DataManager, - SingleDataManager, -) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( BlockManager, @@ -14,9 +10,7 @@ "DatetimeTZBlock", # pylint: disable=undefined-all-variable "ExtensionBlock", # pylint: disable=undefined-all-variable "make_block", - "DataManager", "BlockManager", - "SingleDataManager", "SingleBlockManager", "concatenate_managers", ] diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py deleted file mode 100644 index d6d588d5e2492..0000000000000 --- a/pandas/core/internals/base.py +++ /dev/null @@ -1,406 +0,0 @@ -""" -Base class for the internal managers. BlockManager inherits from this class. -""" -from __future__ import annotations - -from typing import ( - TYPE_CHECKING, - Any, - Literal, - cast, - final, -) - -import numpy as np - -from pandas._config import ( - using_copy_on_write, - warn_copy_on_write, -) - -from pandas._libs import ( - algos as libalgos, - lib, -) -from pandas.errors import AbstractMethodError -from pandas.util._validators import validate_bool_kwarg - -from pandas.core.dtypes.cast import ( - find_common_type, - np_can_hold_element, -) -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - SparseDtype, -) - -from pandas.core.base import PandasObject -from pandas.core.construction import extract_array -from pandas.core.indexes.api import ( - Index, - default_index, -) - -if TYPE_CHECKING: - from pandas._typing import ( - ArrayLike, - AxisInt, - DtypeObj, - Self, - Shape, - ) - - -class _AlreadyWarned: - def __init__(self) -> None: - # This class is used on the manager level to the block level to - # ensure that we warn only once. The block method can update the - # warned_already option without returning a value to keep the - # interface consistent. This is only a temporary solution for - # CoW warnings. - self.warned_already = False - - -class DataManager(PandasObject): - # TODO share more methods/attributes - - axes: list[Index] - - @property - def items(self) -> Index: - raise AbstractMethodError(self) - - @final - def __len__(self) -> int: - return len(self.items) - - @property - def ndim(self) -> int: - return len(self.axes) - - @property - def shape(self) -> Shape: - return tuple(len(ax) for ax in self.axes) - - @final - def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: - # Caller is responsible for ensuring we have an Index object. - old_len = len(self.axes[axis]) - new_len = len(new_labels) - - if axis == 1 and len(self.items) == 0: - # If we are setting the index on a DataFrame with no columns, - # it is OK to change the length. - pass - - elif new_len != old_len: - raise ValueError( - f"Length mismatch: Expected axis has {old_len} elements, new " - f"values have {new_len} elements" - ) - - def reindex_indexer( - self, - new_axis, - indexer, - axis: AxisInt, - fill_value=None, - allow_dups: bool = False, - copy: bool = True, - only_slice: bool = False, - ) -> Self: - raise AbstractMethodError(self) - - @final - def reindex_axis( - self, - new_index: Index, - axis: AxisInt, - fill_value=None, - only_slice: bool = False, - ) -> Self: - """ - Conform data manager to new index. - """ - new_index, indexer = self.axes[axis].reindex(new_index) - - return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=False, - only_slice=only_slice, - ) - - def _equal_values(self, other: Self) -> bool: - """ - To be implemented by the subclasses. Only check the column values - assuming shape and indexes have already been checked. - """ - raise AbstractMethodError(self) - - @final - def equals(self, other: object) -> bool: - """ - Implementation for DataFrame.equals - """ - if not isinstance(other, type(self)): - return False - - self_axes, other_axes = self.axes, other.axes - if len(self_axes) != len(other_axes): - return False - if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): - return False - - return self._equal_values(other) - - def apply( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - raise AbstractMethodError(self) - - def apply_with_block( - self, - f, - align_keys: list[str] | None = None, - **kwargs, - ) -> Self: - raise AbstractMethodError(self) - - @final - def isna(self, func) -> Self: - return self.apply("apply", func=func) - - @final - def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self: - if limit is not None: - # Do this validation even if we go through one of the no-op paths - limit = libalgos.validate_limit(None, limit=limit) - - return self.apply( - "fillna", - value=value, - limit=limit, - inplace=inplace, - downcast=downcast, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def where(self, other, cond, align: bool) -> Self: - if align: - align_keys = ["other", "cond"] - else: - align_keys = ["cond"] - other = extract_array(other, extract_numpy=True) - - return self.apply( - "where", - align_keys=align_keys, - other=other, - cond=cond, - using_cow=using_copy_on_write(), - ) - - @final - def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: - if align: - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - new = extract_array(new, extract_numpy=True) - - already_warned = None - if warn_copy_on_write(): - already_warned = _AlreadyWarned() - if not warn: - already_warned.warned_already = True - - return self.apply( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, - using_cow=using_copy_on_write(), - already_warned=already_warned, - ) - - @final - def round(self, decimals: int, using_cow: bool = False) -> Self: - return self.apply( - "round", - decimals=decimals, - using_cow=using_cow, - ) - - @final - def replace(self, to_replace, value, inplace: bool) -> Self: - inplace = validate_bool_kwarg(inplace, "inplace") - # NDFrame.replace ensures the not-is_list_likes here - assert not lib.is_list_like(to_replace) - assert not lib.is_list_like(value) - return self.apply( - "replace", - to_replace=to_replace, - value=value, - inplace=inplace, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def replace_regex(self, **kwargs) -> Self: - return self.apply( - "_replace_regex", - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - @final - def replace_list( - self, - src_list: list[Any], - dest_list: list[Any], - inplace: bool = False, - regex: bool = False, - ) -> Self: - """do a list replace""" - inplace = validate_bool_kwarg(inplace, "inplace") - - bm = self.apply( - "replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - bm._consolidate_inplace() - return bm - - def interpolate(self, inplace: bool, **kwargs) -> Self: - return self.apply( - "interpolate", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: - return self.apply( - "pad_or_backfill", - inplace=inplace, - **kwargs, - using_cow=using_copy_on_write(), - already_warned=_AlreadyWarned(), - ) - - def shift(self, periods: int, fill_value) -> Self: - if fill_value is lib.no_default: - fill_value = None - - return self.apply("shift", periods=periods, fill_value=fill_value) - - # -------------------------------------------------------------------- - # Consolidation: No-ops for all but BlockManager - - def is_consolidated(self) -> bool: - return True - - def consolidate(self) -> Self: - return self - - def _consolidate_inplace(self) -> None: - return - - -class SingleDataManager(DataManager): - @property - def ndim(self) -> Literal[1]: - return 1 - - @final - @property - def array(self) -> ArrayLike: - """ - Quick access to the backing array of the Block. - """ - # error: "SingleDataManager" has no attribute "arrays"; maybe "array" - return self.arrays[0] # type: ignore[attr-defined] - - def setitem_inplace(self, indexer, value, warn: bool = True) -> None: - """ - Set values with indexer. - - For SingleBlockManager, this backs s[indexer] = value - - This is an inplace version of `setitem()`, mutating the manager/values - in place, not returning a new Manager (and Block), and thus never changing - the dtype. - """ - arr = self.array - - # EAs will do this validation in their own __setitem__ methods. - if isinstance(arr, np.ndarray): - # Note: checking for ndarray instead of np.dtype means we exclude - # dt64/td64, which do their own validation. - value = np_can_hold_element(arr.dtype, value) - - if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: - # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 - value = value[0, ...] - - arr[indexer] = value - - def grouped_reduce(self, func): - arr = self.array - res = func(arr) - index = default_index(len(res)) - - mgr = type(self).from_array(res, index) - return mgr - - @classmethod - def from_array(cls, arr: ArrayLike, index: Index): - raise AbstractMethodError(cls) - - -def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: - """ - Find the common dtype for `blocks`. - - Parameters - ---------- - blocks : List[DtypeObj] - - Returns - ------- - dtype : np.dtype, ExtensionDtype, or None - None is returned when `blocks` is empty. - """ - if not len(dtypes): - return None - - return find_common_type(dtypes) - - -def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: - # TODO: https://github.com/pandas-dev/pandas/issues/22791 - # Give EAs some input on what happens here. Sparse needs this. - if isinstance(dtype, SparseDtype): - dtype = dtype.subtype - dtype = cast(np.dtype, dtype) - elif isinstance(dtype, ExtensionDtype): - dtype = np.dtype("object") - elif dtype == np.dtype(str): - dtype = np.dtype("object") - return dtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c6a8b61e0c51e..e1a18cb79a1d6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -7,9 +7,11 @@ import itertools from typing import ( TYPE_CHECKING, + Any, Callable, Literal, cast, + final, ) import warnings import weakref @@ -22,6 +24,7 @@ ) from pandas._libs import ( + algos as libalgos, internals as libinternals, lib, ) @@ -30,11 +33,19 @@ BlockValuesRefs, ) from pandas._libs.tslibs import Timestamp -from pandas.errors import PerformanceWarning +from pandas.errors import ( + AbstractMethodError, + PerformanceWarning, +) from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level +from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.cast import infer_dtype_from_scalar +from pandas.core.dtypes.cast import ( + find_common_type, + infer_dtype_from_scalar, + np_can_hold_element, +) from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, @@ -43,6 +54,7 @@ from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -60,6 +72,7 @@ DatetimeArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.base import PandasObject from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -67,14 +80,9 @@ from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import ( Index, + default_index, ensure_index, ) -from pandas.core.internals.base import ( - DataManager, - SingleDataManager, - ensure_np_dtype, - interleaved_dtype, -) from pandas.core.internals.blocks import ( COW_WARNING_GENERAL_MSG, COW_WARNING_SETITEM_MSG, @@ -106,7 +114,49 @@ from pandas.api.extensions import ExtensionArray -class BaseBlockManager(DataManager): +def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None: + """ + Find the common dtype for `blocks`. + + Parameters + ---------- + blocks : List[DtypeObj] + + Returns + ------- + dtype : np.dtype, ExtensionDtype, or None + None is returned when `blocks` is empty. + """ + if not len(dtypes): + return None + + return find_common_type(dtypes) + + +def ensure_np_dtype(dtype: DtypeObj) -> np.dtype: + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + dtype = cast(np.dtype, dtype) + elif isinstance(dtype, ExtensionDtype): + dtype = np.dtype("object") + elif dtype == np.dtype(str): + dtype = np.dtype("object") + return dtype + + +class _AlreadyWarned: + def __init__(self) -> None: + # This class is used on the manager level to the block level to + # ensure that we warn only once. The block method can update the + # warned_already option without returning a value to keep the + # interface consistent. This is only a temporary solution for + # CoW warnings. + self.warned_already = False + + +class BaseBlockManager(PandasObject): """ Core internal data structure to implement DataFrame, Series, etc. @@ -174,6 +224,14 @@ def ndim(self) -> int: def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: raise NotImplementedError + @final + def __len__(self) -> int: + return len(self.items) + + @property + def shape(self) -> Shape: + return tuple(len(ax) for ax in self.axes) + @classmethod def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self: raise NotImplementedError @@ -239,6 +297,23 @@ def set_axis(self, axis: AxisInt, new_labels: Index) -> None: self._validate_set_axis(axis, new_labels) self.axes[axis] = new_labels + @final + def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. + old_len = len(self.axes[axis]) + new_len = len(new_labels) + + if axis == 1 and len(self.items) == 0: + # If we are setting the index on a DataFrame with no columns, + # it is OK to change the length. + pass + + elif new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + @property def is_single_block(self) -> bool: # Assumes we are 2D; overridden by SingleBlockManager @@ -315,6 +390,29 @@ def __repr__(self) -> str: output += f"\n{block}" return output + def _equal_values(self, other: Self) -> bool: + """ + To be implemented by the subclasses. Only check the column values + assuming shape and indexes have already been checked. + """ + raise AbstractMethodError(self) + + @final + def equals(self, other: object) -> bool: + """ + Implementation for DataFrame.equals + """ + if not isinstance(other, type(self)): + return False + + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + + return self._equal_values(other) + def apply( self, f, @@ -367,6 +465,152 @@ def apply( out = type(self).from_blocks(result_blocks, self.axes) return out + def apply_with_block( + self, + f, + align_keys: list[str] | None = None, + **kwargs, + ) -> Self: + raise AbstractMethodError(self) + + @final + def isna(self, func) -> Self: + return self.apply("apply", func=func) + + @final + def fillna(self, value, limit: int | None, inplace: bool, downcast) -> Self: + if limit is not None: + # Do this validation even if we go through one of the no-op paths + limit = libalgos.validate_limit(None, limit=limit) + + return self.apply( + "fillna", + value=value, + limit=limit, + inplace=inplace, + downcast=downcast, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def where(self, other, cond, align: bool) -> Self: + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply( + "where", + align_keys=align_keys, + other=other, + cond=cond, + using_cow=using_copy_on_write(), + ) + + @final + def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + already_warned = None + if warn_copy_on_write(): + already_warned = _AlreadyWarned() + if not warn: + already_warned.warned_already = True + + return self.apply( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + using_cow=using_copy_on_write(), + already_warned=already_warned, + ) + + @final + def round(self, decimals: int, using_cow: bool = False) -> Self: + return self.apply( + "round", + decimals=decimals, + using_cow=using_cow, + ) + + @final + def replace(self, to_replace, value, inplace: bool) -> Self: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not lib.is_list_like(to_replace) + assert not lib.is_list_like(value) + return self.apply( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def replace_regex(self, **kwargs) -> Self: + return self.apply( + "_replace_regex", + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + @final + def replace_list( + self, + src_list: list[Any], + dest_list: list[Any], + inplace: bool = False, + regex: bool = False, + ) -> Self: + """do a list replace""" + inplace = validate_bool_kwarg(inplace, "inplace") + + bm = self.apply( + "replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + bm._consolidate_inplace() + return bm + + def interpolate(self, inplace: bool, **kwargs) -> Self: + return self.apply( + "interpolate", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + def pad_or_backfill(self, inplace: bool, **kwargs) -> Self: + return self.apply( + "pad_or_backfill", + inplace=inplace, + **kwargs, + using_cow=using_copy_on_write(), + already_warned=_AlreadyWarned(), + ) + + def shift(self, periods: int, fill_value) -> Self: + if fill_value is lib.no_default: + fill_value = None + + return self.apply("shift", periods=periods, fill_value=fill_value) + def setitem(self, indexer, value, warn: bool = True) -> Self: """ Set values with indexer. @@ -602,6 +846,9 @@ def copy_func(ax): res._consolidate_inplace() return res + def is_consolidated(self) -> bool: + return True + def consolidate(self) -> Self: """ Join together blocks having same dtype @@ -618,6 +865,31 @@ def consolidate(self) -> Self: bm._consolidate_inplace() return bm + def _consolidate_inplace(self) -> None: + return + + @final + def reindex_axis( + self, + new_index: Index, + axis: AxisInt, + fill_value=None, + only_slice: bool = False, + ) -> Self: + """ + Conform data manager to new index. + """ + new_index, indexer = self.axes[axis].reindex(new_index) + + return self.reindex_indexer( + new_index, + indexer, + axis=axis, + fill_value=fill_value, + copy=False, + only_slice=only_slice, + ) + def reindex_indexer( self, new_axis: Index, @@ -1820,7 +2092,7 @@ def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: raise NotImplementedError("This logic lives (for now) in internals.concat") -class SingleBlockManager(BaseBlockManager, SingleDataManager): +class SingleBlockManager(BaseBlockManager): """manage a single block with""" @property @@ -1939,6 +2211,14 @@ def _post_setstate(self) -> None: def _block(self) -> Block: return self.blocks[0] + @final + @property + def array(self) -> ArrayLike: + """ + Quick access to the backing array of the Block. + """ + return self.arrays[0] + # error: Cannot override writeable attribute with read-only property @property def _blknos(self) -> None: # type: ignore[override] @@ -2041,7 +2321,19 @@ def setitem_inplace(self, indexer, value, warn: bool = True) -> None: stacklevel=find_stack_level(), ) - super().setitem_inplace(indexer, value) + arr = self.array + + # EAs will do this validation in their own __setitem__ methods. + if isinstance(arr, np.ndarray): + # Note: checking for ndarray instead of np.dtype means we exclude + # dt64/td64, which do their own validation. + value = np_can_hold_element(arr.dtype, value) + + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + + arr[indexer] = value def idelete(self, indexer) -> SingleBlockManager: """ @@ -2087,6 +2379,14 @@ def _equal_values(self, other: Self) -> bool: right = other.blocks[0].values return array_equals(left, right) + def grouped_reduce(self, func): + arr = self.array + res = func(arr) + index = default_index(len(res)) + + mgr = type(self).from_array(res, index) + return mgr + # -------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index ccd7222fb16e1..7c1d3ff774d0a 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -23,15 +23,12 @@ def test_namespace(): "concat", "managers", "construction", - "base", "api", "ops", ] expected = [ "make_block", - "DataManager", "BlockManager", - "SingleDataManager", "SingleBlockManager", "concatenate_managers", ]