diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06a06484b921a..a84de6cda0432 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -165,6 +165,9 @@ jobs: pytest pandas/tests/resample/ pytest pandas/tests/reshape/merge + pytest pandas/tests/series/methods + pytest pandas/tests/series/test_* + # indexing subset (temporary since other tests don't pass yet) pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean pytest pandas/tests/frame/indexing/test_where.py diff --git a/pandas/_typing.py b/pandas/_typing.py index c50d532f40dd7..e464f2a021ef6 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -58,6 +58,8 @@ from pandas.core.internals import ( ArrayManager, BlockManager, + SingleArrayManager, + SingleBlockManager, ) from pandas.core.resample import Resampler from pandas.core.series import Series @@ -184,3 +186,4 @@ # internals Manager = Union["ArrayManager", "BlockManager"] +SingleManager = Union["SingleArrayManager", "SingleBlockManager"] diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9ba9a5bd38164..43900709ad11f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -481,7 +481,6 @@ def sanitize_array( DataFrame constructor, as the dtype keyword there may be interpreted as only applying to a subset of columns, see GH#24435. 
""" - if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -555,6 +554,7 @@ def sanitize_array( inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) + subarr = extract_array(subarr, extract_numpy=True) return subarr diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5bba7ab67b2bf..205aebbf4124a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -138,6 +138,7 @@ from pandas.core.internals import ( ArrayManager, BlockManager, + SingleArrayManager, ) from pandas.core.internals.construction import mgr_to_mgr from pandas.core.missing import find_valid_index @@ -5563,7 +5564,7 @@ def _protect_consolidate(self, f): Consolidate _mgr -- if the blocks have changed, then clear the cache """ - if isinstance(self._mgr, ArrayManager): + if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): return f() blocks_before = len(self._mgr.blocks) result = f() diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index aaf67fb1be532..2de5e81360a93 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -108,10 +108,7 @@ all_indexes_same, ) import pandas.core.indexes.base as ibase -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) +from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.util.numba_ import maybe_use_numba @@ -1151,18 +1148,18 @@ def py_fallback(values: ArrayLike) -> ArrayLike: # in the operation. We un-split here. result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy + # unwrap DataFrame/Series to get array mgr = result._mgr - assert isinstance(mgr, BlockManager) - - # unwrap DataFrame to get array - if len(mgr.blocks) != 1: + arrays = mgr.arrays + if len(arrays) != 1: # We've split an object block! Everything we've assumed # about a single block input returning a single block output # is a lie. 
See eg GH-39329 return mgr.as_array() else: - result = mgr.blocks[0].values - return result + # We are a single block from a BlockManager + # or one array from SingleArrayManager + return arrays[0] def array_func(values: ArrayLike) -> ArrayLike: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bded503a1e6db..a434e94abec5a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1590,7 +1590,11 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if not take_split_path and self.obj._mgr.blocks and self.ndim > 1: + if ( + not take_split_path + and getattr(self.obj._mgr, "blocks", False) + and self.ndim > 1 + ): # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value blk = self.obj._mgr.blocks[0] diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 054ce8a40288b..23d35b412e1ae 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,5 +1,11 @@ -from pandas.core.internals.array_manager import ArrayManager -from pandas.core.internals.base import DataManager +from pandas.core.internals.array_manager import ( + ArrayManager, + SingleArrayManager, +) +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, +) from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, CategoricalBlock, @@ -34,7 +40,9 @@ "DataManager", "ArrayManager", "BlockManager", + "SingleDataManager", "SingleBlockManager", + "SingleArrayManager", "concatenate_managers", # those two are preserved here for downstream compatibility (GH-33892) "create_block_manager_from_arrays", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 998f1ffcf02ee..d6f2530ed2ca5 100644 --- a/pandas/core/internals/array_manager.py +++ 
b/pandas/core/internals/array_manager.py @@ -4,7 +4,6 @@ from __future__ import annotations from typing import ( - TYPE_CHECKING, Any, Callable, List, @@ -47,6 +46,7 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, + ABCPandasArray, ABCSeries, ) from pandas.core.dtypes.missing import ( @@ -58,6 +58,9 @@ from pandas.core.arrays import ( DatetimeArray, ExtensionArray, + IntervalArray, + PandasArray, + PeriodArray, TimedeltaArray, ) from pandas.core.arrays.sparse import SparseDtype @@ -71,13 +74,12 @@ Index, ensure_index, ) -from pandas.core.internals.base import DataManager +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, +) from pandas.core.internals.blocks import make_block -if TYPE_CHECKING: - from pandas.core.internals.managers import SingleBlockManager - - T = TypeVar("T", bound="ArrayManager") @@ -132,7 +134,7 @@ def make_empty(self: T, axes=None) -> T: @property def items(self) -> Index: - return self._axes[1] + return self._axes[-1] @property def axes(self) -> List[Index]: # type: ignore[override] @@ -191,7 +193,8 @@ def get_dtypes(self): def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" - output += f"\nColumns: {self._axes[1]}" + if self.ndim == 2: + output += f"\nColumns: {self._axes[1]}" output += f"\n{len(self.arrays)} arrays:" for arr in self.arrays: output += f"\n{arr.dtype}" @@ -407,12 +410,16 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: # The caller is responsible for ensuring that # obj.axes[-1].equals(self.items) if obj.ndim == 1: - kwargs[k] = obj.iloc[[i]] + if self.ndim == 2: + kwargs[k] = obj.iloc[slice(i, i + 1)]._values + else: + kwargs[k] = obj.iloc[:]._values else: kwargs[k] = obj.iloc[:, [i]]._values else: # otherwise we have an ndarray - kwargs[k] = obj[[i]] + if obj.ndim == 2: + kwargs[k] = obj[[i]] if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] # DatetimeArray needs to be converted to ndarray for DatetimeBlock 
@@ -420,15 +427,21 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: elif arr.dtype.kind == "m" and not isinstance(arr, np.ndarray): # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock arr = arr._data # type: ignore[union-attr] - if isinstance(arr, np.ndarray): - arr = np.atleast_2d(arr) - block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + + if self.ndim == 2: + if isinstance(arr, np.ndarray): + arr = np.atleast_2d(arr) + block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + else: + block = make_block(arr, placement=slice(0, len(self), 1), ndim=1) + applied = getattr(block, f)(**kwargs) if isinstance(applied, list): applied = applied[0] arr = applied.values - if isinstance(arr, np.ndarray): - arr = arr[0, :] + if self.ndim == 2: + if isinstance(arr, np.ndarray): + arr = arr[0, :] result_arrays.append(arr) return type(self)(result_arrays, self._axes) @@ -459,7 +472,6 @@ def where(self, other, cond, align: bool, errors: str, axis: int) -> ArrayManage # return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): - if align: align_keys = ["new", "mask"] else: @@ -734,16 +746,12 @@ def fast_xs(self, loc: int) -> ArrayLike: result = np.array(values, dtype=dtype) return result - def iget(self, i: int) -> SingleBlockManager: + def iget(self, i: int) -> SingleArrayManager: """ - Return the data as a SingleBlockManager. + Return the data as a SingleArrayManager. 
""" - from pandas.core.internals.managers import SingleBlockManager - values = self.arrays[i] - block = make_block(values, placement=slice(0, len(values)), ndim=1) - - return SingleBlockManager(block, self._axes[0]) + return SingleArrayManager([values], [self._axes[0]]) def iget_values(self, i: int) -> ArrayLike: """ @@ -909,8 +917,8 @@ def _reindex_indexer( if not allow_dups: self._axes[axis]._validate_can_reindex(indexer) - # if axis >= self.ndim: - # raise IndexError("Requested axis not found in manager") + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") if axis == 1: new_arrays = [] @@ -1039,3 +1047,151 @@ def _interleaved_dtype(blocks) -> Optional[DtypeObj]: return None return find_common_type([b.dtype for b in blocks]) + + +class SingleArrayManager(ArrayManager, SingleDataManager): + + __slots__ = [ + "_axes", # private attribute, because 'axes' has different order, see below + "arrays", + ] + + arrays: List[Union[np.ndarray, ExtensionArray]] + _axes: List[Index] + + ndim = 1 + + def __init__( + self, + arrays: List[Union[np.ndarray, ExtensionArray]], + axes: List[Index], + verify_integrity: bool = True, + ): + self._axes = axes + self.arrays = arrays + + if verify_integrity: + assert len(axes) == 1 + assert len(arrays) == 1 + self._axes = [ensure_index(ax) for ax in self._axes] + arr = arrays[0] + arr = ensure_wrapped_if_datetimelike(arr) + if isinstance(arr, ABCPandasArray): + arr = arr.to_numpy() + self.arrays = [arr] + self._verify_integrity() + + def _verify_integrity(self) -> None: + (n_rows,) = self.shape + assert len(self.arrays) == 1 + assert len(self.arrays[0]) == n_rows + + @staticmethod + def _normalize_axis(axis): + return axis + + def make_empty(self, axes=None) -> SingleArrayManager: + """Return an empty ArrayManager with index/array of length 0""" + if axes is None: + axes = [Index([], dtype=object)] + array = np.array([], dtype=self.dtype) + return type(self)([array], axes) + + @classmethod + def 
from_array(cls, array, index): + return cls([array], [index]) + + @property + def axes(self): + return self._axes + + @property + def index(self) -> Index: + return self._axes[0] + + @property + def array(self): + return self.arrays[0] + + @property + def dtype(self): + return self.array.dtype + + def external_values(self): + """The array that Series.values returns""" + if isinstance(self.array, (PeriodArray, IntervalArray)): + return self.array.astype(object) + elif isinstance(self.array, (DatetimeArray, TimedeltaArray)): + return self.array._data + else: + return self.array + + def internal_values(self): + """The array that Series._values returns""" + return self.array + + def array_values(self): + """The array that Series.array returns""" + arr = self.array + if isinstance(arr, np.ndarray): + arr = PandasArray(arr) + return arr + + @property + def _can_hold_na(self) -> bool: + if isinstance(self.array, np.ndarray): + return self.array.dtype.kind not in ["b", "i", "u"] + else: + # ExtensionArray + return self.array._can_hold_na + + @property + def is_single_block(self) -> bool: + return True + + def _consolidate_check(self): + pass + + def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") + + new_array = self.array[slobj] + new_index = self.index[slobj] + return type(self)([new_array], [new_index]) + + def apply(self, func, **kwargs): + if callable(func): + new_array = func(self.array, **kwargs) + else: + new_array = getattr(self.array, func)(**kwargs) + return type(self)([new_array], self._axes) + + def setitem(self, indexer, value): + return self.apply_with_block("setitem", indexer=indexer, value=value) + + def idelete(self, indexer): + """ + Delete selected locations in-place (new array, same ArrayManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[0][to_keep]] + self._axes = 
[self._axes[0][to_keep]] + + def _get_data_subset(self, predicate: Callable) -> ArrayManager: + # used in get_numeric_data / get_bool_data + if predicate(self.array): + return type(self)(self.arrays, self._axes, verify_integrity=False) + else: + return self.make_empty() + + def set_values(self, values: ArrayLike): + """ + Set (replace) the values of the SingleArrayManager in place. + + Use at your own risk! This does not check if the passed values are + valid for the current SingleArrayManager (length, dtype, etc). + """ + self.arrays[0] = values diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index 2ce91134f61d6..0e4b5ce2e7452 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -98,3 +98,7 @@ def equals(self, other: object) -> bool: return False return self._equal_values(other) + + +class SingleDataManager(DataManager): + ndim = 1 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 09559e571d5ee..b3a60d7d1ef45 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -61,7 +61,10 @@ Index, ensure_index, ) -from pandas.core.internals.base import DataManager +from pandas.core.internals.base import ( + DataManager, + SingleDataManager, +) from pandas.core.internals.blocks import ( Block, CategoricalBlock, @@ -1525,7 +1528,7 @@ def unstack(self, unstacker, fill_value) -> BlockManager: return bm -class SingleBlockManager(BlockManager): +class SingleBlockManager(BlockManager, SingleDataManager): """ manage a single block with """ ndim = 1 @@ -1617,6 +1620,10 @@ def internal_values(self): """The array that Series._values returns""" return self._block.internal_values() + def array_values(self): + """The array that Series.array returns""" + return self._block.array_values() + @property def _can_hold_na(self) -> bool: return self._block._can_hold_na diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a0dfb1c83a70b..288668b700ad0 100644 
--- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1742,10 +1742,12 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) else: - # DatetimeArray + # DatetimeArray/TimedeltaArray # TODO: have this case go through a DTA method? + # For DatetimeTZDtype, view result as M8[ns] + npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]" result = type(values)._simple_new( # type: ignore[attr-defined] - result.view("M8[ns]"), dtype=orig_dtype + result.view(npdtype), dtype=orig_dtype ) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): diff --git a/pandas/core/series.py b/pandas/core/series.py index 24c356e7a8269..5a5d1c44b312c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -43,6 +43,7 @@ FrameOrSeriesUnion, IndexKeyFunc, NpDtype, + SingleManager, StorageOptions, ValueKeyFunc, ) @@ -125,7 +126,10 @@ from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer -from pandas.core.internals import SingleBlockManager +from pandas.core.internals import ( + SingleArrayManager, + SingleBlockManager, +) from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( ensure_key_mapped, @@ -267,7 +271,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) __hash__ = generic.NDFrame.__hash__ - _mgr: SingleBlockManager + _mgr: SingleManager div: Callable[[Series, Any], Series] rdiv: Callable[[Series, Any], Series] @@ -285,7 +289,7 @@ def __init__( ): if ( - isinstance(data, SingleBlockManager) + isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None and dtype is None and copy is False @@ -299,8 +303,12 @@ def __init__( if fastpath: # data is an ndarray, index is defined - if not isinstance(data, 
SingleBlockManager): - data = SingleBlockManager.from_array(data, index) + if not isinstance(data, (SingleBlockManager, SingleArrayManager)): + manager = get_option("mode.data_manager") + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) if copy: data = data.copy() if index is None: @@ -363,7 +371,7 @@ def __init__( data, index = self._init_dict(data, index, dtype) dtype = None copy = False - elif isinstance(data, SingleBlockManager): + elif isinstance(data, (SingleBlockManager, SingleArrayManager)): if index is None: index = data.index elif not data.index.equals(index) or copy: @@ -388,7 +396,7 @@ def __init__( com.require_length_match(data, index) # create/copy the manager - if isinstance(data, SingleBlockManager): + if isinstance(data, (SingleBlockManager, SingleArrayManager)): if dtype is not None: data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: @@ -396,7 +404,11 @@ def __init__( else: data = sanitize_array(data, index, dtype, copy) - data = SingleBlockManager.from_array(data, index) + manager = get_option("mode.data_manager") + if manager == "block": + data = SingleBlockManager.from_array(data, index) + elif manager == "array": + data = SingleArrayManager.from_array(data, index) generic.NDFrame.__init__(self, data) self.name = name @@ -657,7 +669,7 @@ def _values(self): @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: - return self._mgr._block.array_values() + return self._mgr.array_values() # ops def ravel(self, order="C"): diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index f873c93d90683..bb541739c7f44 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -5,6 +5,7 @@ lib, reduction as libreduction, ) +import pandas.util._test_decorators as td import pandas as pd from pandas 
import Series @@ -61,7 +62,13 @@ def cumsum_max(x): return 0 -@pytest.mark.parametrize("func", [cumsum_max, assert_block_lengths]) +@pytest.mark.parametrize( + "func", + [ + cumsum_max, + pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test), + ], +) def test_mgr_locs_updated(func): # https://github.com/pandas-dev/pandas/issues/31802 # Some operations may require creating new blocks, which requires diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a69a693bb6203..ab484e7ae9d8a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -13,6 +13,7 @@ iNaT, lib, ) +import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -650,6 +651,7 @@ def test_constructor_copy(self): assert x[0] == 2.0 assert y[0] == 1.0 + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite test @pytest.mark.parametrize( "index", [ @@ -1691,12 +1693,14 @@ def test_series_constructor_infer_multiindex(self): class TestSeriesConstructorInternals: - def test_constructor_no_pandas_array(self): + def test_constructor_no_pandas_array(self, using_array_manager): ser = Series([1, 2, 3]) result = Series(ser.array) tm.assert_series_equal(ser, result) - assert isinstance(result._mgr.blocks[0], NumericBlock) + if not using_array_manager: + assert isinstance(result._mgr.blocks[0], NumericBlock) + @td.skip_array_manager_invalid_test def test_from_array(self): result = Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) assert result._mgr.blocks[0].is_extension is False @@ -1704,6 +1708,7 @@ def test_from_array(self): result = Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._mgr.blocks[0].is_extension is False + @td.skip_array_manager_invalid_test def test_from_list_dtype(self): result = Series(["1H", "2H"], dtype="timedelta64[ns]") assert result._mgr.blocks[0].is_extension is False