diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 88c95331cd393..12b5ce61195ea 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -569,8 +569,14 @@ cpdef update_blklocs_and_blknos( def _unpickle_block(values, placement, ndim): # We have to do some gymnastics b/c "ndim" is keyword-only - from pandas.core.internals.blocks import new_block - + from pandas.core.internals.blocks import ( + maybe_coerce_values, + new_block, + ) + values = maybe_coerce_values(values) + + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) return new_block(values, placement, ndim=ndim) @@ -795,6 +801,7 @@ cdef class BlockManager: from pandas.core.construction import extract_array from pandas.core.internals.blocks import ( ensure_block_shape, + maybe_coerce_values, new_block, ) from pandas.core.internals.managers import ensure_index @@ -808,7 +815,10 @@ cdef class BlockManager: vals = blk["values"] # older versions may hold e.g. DatetimeIndex instead of DTA vals = extract_array(vals, extract_numpy=True) - blk["values"] = ensure_block_shape(vals, ndim=ndim) + blk["values"] = maybe_coerce_values(ensure_block_shape(vals, ndim=ndim)) + + if not isinstance(blk["mgr_locs"], BlockPlacement): + blk["mgr_locs"] = BlockPlacement(blk["mgr_locs"]) nbs = [ new_block(blk["values"], blk["mgr_locs"], ndim=ndim) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a692ad6afe92c..67fa6241b8c3e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -78,6 +78,7 @@ interleaved_dtype, ) from pandas.core.internals.blocks import ( + BlockPlacement, ensure_block_shape, external_values, extract_pandas_array, @@ -290,11 +291,14 @@ def apply_with_block( # convert for the Block constructors. arr = np.asarray(arr) + arr = maybe_coerce_values(arr) if self.ndim == 2: arr = ensure_block_shape(arr, 2) - block = new_block(arr, placement=slice(0, 1, 1), ndim=2) + bp = BlockPlacement(slice(0, 1, 1)) + block = new_block(arr, placement=bp, ndim=2) else: - block = new_block(arr, placement=slice(0, len(self), 1), ndim=1) + bp = BlockPlacement(slice(0, len(self), 1)) + block = new_block(arr, placement=bp, ndim=1) applied = getattr(block, f)(**kwargs) if isinstance(applied, list): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6dcb73f6793ad..dd616015dd550 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -211,7 +211,10 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None: @final def make_block( - self, values, placement=None, refs: BlockValuesRefs | None = None + self, + values, + placement: BlockPlacement | None = None, + refs: BlockValuesRefs | None = None, ) -> Block: """ Create a new block, with type inference propagate any values that are @@ -222,8 +225,6 @@ def make_block( if self.is_extension: values = ensure_block_shape(values, ndim=self.ndim) - # TODO: perf by not going through new_block - # We assume maybe_coerce_values has already been called return new_block(values, placement=placement, ndim=self.ndim, refs=refs) @final @@ -327,6 +328,7 @@ def apply(self, func, **kwargs) -> list[Block]: """ result = func(self.values, **kwargs) + result = maybe_coerce_values(result) return self._split_op_result(result) @final @@ -359,7 +361,8 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: else: vals = result[i] - block = self.make_block(values=vals, placement=loc) + bp = BlockPlacement(loc) + block = self.make_block(values=vals, placement=bp) nbs.append(block) return nbs @@ -454,6 +457,7 @@ def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]: Refactored to allow use of maybe_split. """ new_values = maybe_downcast_to_dtype(self.values, dtype=dtype) + new_values = maybe_coerce_values(new_values) refs = self.refs if using_cow and new_values is self.values else None return [self.make_block(new_values, refs=refs)] @@ -2292,6 +2296,7 @@ def convert( refs = self.refs res_values = ensure_block_shape(res_values, self.ndim) + res_values = maybe_coerce_values(res_values) return [self.make_block(res_values, refs=refs)] @@ -2373,18 +2378,17 @@ def new_block_2d( def new_block( - values, placement, *, ndim: int, refs: BlockValuesRefs | None = None + values, + placement: BlockPlacement, + *, + ndim: int, + refs: BlockValuesRefs | None = None, ) -> Block: - # caller is responsible for ensuring values is NOT a PandasArray - - if not isinstance(placement, BlockPlacement): - placement = BlockPlacement(placement) - - check_ndim(values, placement, ndim) - + # caller is responsible for ensuring: + # - values is NOT a PandasArray + # - check_ndim/ensure_block_shape already checked + # - maybe_coerce_values already called/unnecessary klass = get_block_type(values.dtype) - - values = maybe_coerce_values(values) return klass(values, ndim=ndim, placement=placement, refs=refs) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ed1a9b193b3e4..0b05e3b7633d9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -72,6 +72,7 @@ ensure_block_shape, extend_blocks, get_block_type, + maybe_coerce_values, new_block, new_block_2d, ) @@ -1033,9 +1034,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager: # is this ruled out in the general case? result = self.blocks[0].iget((slice(None), loc)) # in the case of a single block, the new block is a view + bp = BlockPlacement(slice(0, len(result))) block = new_block( result, - placement=slice(0, len(result)), + placement=bp, ndim=1, refs=self.blocks[0].refs, ) @@ -1070,7 +1072,8 @@ def fast_xs(self, loc: int) -> SingleBlockManager: dtype = cast(ExtensionDtype, dtype) result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) - block = new_block(result, placement=slice(0, len(result)), ndim=1) + bp = BlockPlacement(slice(0, len(result))) + block = new_block(result, placement=bp, ndim=1) return SingleBlockManager(block, self.axes[0]) def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: @@ -1886,7 +1889,9 @@ def from_array( """ Constructor for if we have an array that is not yet a Block. """ - block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs) + array = maybe_coerce_values(array) + bp = BlockPlacement(slice(0, len(index))) + block = new_block(array, placement=bp, ndim=1, refs=refs) return cls(block, index) def to_2d_mgr(self, columns: Index) -> BlockManager: @@ -1932,6 +1937,10 @@ def unpickle_block(values, mgr_locs, ndim: int) -> Block: # TODO(EA2D): ndim would be unnecessary with 2D EAs # older pickles may store e.g. DatetimeIndex instead of DatetimeArray values = extract_array(values, extract_numpy=True) + if not isinstance(mgr_locs, BlockPlacement): + mgr_locs = BlockPlacement(mgr_locs) + + values = maybe_coerce_values(values) return new_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index c14178a3e122e..79eb4110cfba9 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -40,6 +40,7 @@ ) from pandas.core.internals.blocks import ( ensure_block_shape, + maybe_coerce_values, new_block, ) @@ -168,6 +169,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_bl else: raise ValueError(f'Unsupported typestr: "{typestr}"') + values = maybe_coerce_values(values) return maker(values, placement=placement, ndim=len(shape)) @@ -349,7 +351,7 @@ def test_delete_datetimelike(self): def test_split(self): # GH#37799 values = np.random.randn(3, 4) - blk = new_block(values, placement=[3, 1, 6], ndim=2) + blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) result = blk._split() # check that we get views, not copies @@ -358,9 +360,9 @@ def test_split(self): assert len(result) == 3 expected = [ - new_block(values[[0]], placement=[3], ndim=2), - new_block(values[[1]], placement=[1], ndim=2), - new_block(values[[2]], placement=[6], ndim=2), + new_block(values[[0]], placement=BlockPlacement([3]), ndim=2), + new_block(values[[1]], placement=BlockPlacement([1]), ndim=2), + new_block(values[[2]], placement=BlockPlacement([6]), ndim=2), ] for res, exp in zip(result, expected): assert_block_equal(res, exp) @@ -425,7 +427,7 @@ def test_iget(self): values = np.random.rand(3, 3) block = new_block( values=values.copy(), - placement=np.arange(3, dtype=np.intp), + placement=BlockPlacement(np.arange(3, dtype=np.intp)), ndim=values.ndim, ) mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))]) @@ -629,13 +631,6 @@ def _compare(old_mgr, new_mgr): assert new_mgr.iget(7).dtype == np.float64 assert new_mgr.iget(8).dtype == np.float16 - def test_invalid_ea_block(self): - with pytest.raises(ValueError, match="need to split"): - create_mgr("a: category; b: category") - - with pytest.raises(ValueError, match="need to split"): - create_mgr("a: category2; b: category2") - def test_interleave(self): # self for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: @@ -1301,7 +1296,7 @@ def test_datetime_block_can_hold_element(self): def test_interval_can_hold_element_emptylist(self, dtype, element): arr = np.array([1, 3, 4], dtype=dtype) ii = IntervalIndex.from_breaks(arr) - blk = new_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, BlockPlacement([1]), ndim=2) assert blk._can_hold_element([]) # TODO: check this holds for all blocks @@ -1310,7 +1305,7 @@ def test_interval_can_hold_element_emptylist(self, dtype, element): def test_interval_can_hold_element(self, dtype, element): arr = np.array([1, 3, 4, 9], dtype=dtype) ii = IntervalIndex.from_breaks(arr) - blk = new_block(ii._data, [1], ndim=2) + blk = new_block(ii._data, BlockPlacement([1]), ndim=2) elem = element(ii) self.check_series_setitem(elem, ii, True) @@ -1335,7 +1330,7 @@ def test_interval_can_hold_element(self, dtype, element): def test_period_can_hold_element_emptylist(self): pi = period_range("2016", periods=3, freq="A") - blk = new_block(pi._data.reshape(1, 3), [1], ndim=2) + blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2) assert blk._can_hold_element([]) @@ -1396,13 +1391,13 @@ def test_should_store_categorical(self): assert not blk.should_store(np.asarray(cat)) -def test_validate_ndim(block_maker): +def test_validate_ndim(): values = np.array([1.0, 2.0]) - placement = slice(2) + placement = BlockPlacement(slice(2)) msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]" with pytest.raises(ValueError, match=msg): - block_maker(values, placement, ndim=2) + make_block(values, placement, ndim=2) def test_block_shape(): @@ -1418,7 +1413,7 @@ def test_make_block_no_pandas_array(block_maker): arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype - result = block_maker(arr, slice(len(arr)), ndim=arr.ndim) + result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim) assert result.dtype.kind in ["i", "u"] if block_maker is make_block: