From c14fd9ee697e6804c0f7d30c27d5494e4f371ffd Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 16:16:56 -0800 Subject: [PATCH 1/2] REF/PERF: do Block dimension validation earlier --- pandas/core/internals/api.py | 37 ++++--- pandas/core/internals/blocks.py | 133 ++++++++++------------- pandas/core/internals/managers.py | 2 + pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/internals/test_internals.py | 24 ++-- 5 files changed, 99 insertions(+), 99 deletions(-) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 3fbe324417c60..48ef78cd7f0f7 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -10,16 +10,17 @@ import numpy as np +from pandas._libs.internals import BlockPlacement from pandas._typing import Dtype from pandas.core.dtypes.common import is_datetime64tz_dtype -from pandas.core.dtypes.dtypes import PandasDtype -from pandas.core.dtypes.generic import ABCPandasArray from pandas.core.arrays import DatetimeArray from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, + check_ndim, + extract_pandas_array, get_block_type, ) @@ -38,24 +39,32 @@ def make_block( - Block.make_block_same_class - Block.__init__ """ - if isinstance(values, ABCPandasArray): - # Ensure that we don't allow PandasArray / PandasDtype in internals. - # For now, blocks should be backed by ndarrays when possible. - values = values.to_numpy() - if ndim and ndim > 1: - # TODO(EA2D): special case not needed with 2D EAs - values = np.atleast_2d(values) - - if isinstance(dtype, PandasDtype): - dtype = dtype.numpy_dtype + values, dtype = extract_pandas_array(values, dtype, ndim) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype): - # TODO: This is no longer hit internally; does it need to be retained - # for e.g. pyarrow? + # pyarrow calls get here values = DatetimeArray._simple_new(values, dtype=dtype) + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + ndim = _maybe_infer_ndim(values, placement, ndim) + check_ndim(values, placement, ndim) return klass(values, ndim=ndim, placement=placement) + + +def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: + if ndim is None: + # GH#38134 Block constructor now assumes ndim is not None + if not isinstance(values.dtype, np.dtype): + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + else: + ndim = values.ndim + return ndim diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e084db77692f5..78a1bead28e2d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -47,7 +47,6 @@ ) from pandas.core.dtypes.common import ( is_categorical_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_list_like, @@ -159,17 +158,10 @@ def __init__(self, values, placement, ndim: int): ndim : int 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame """ - # TODO(EA2D): ndim will be unnecessary with 2D EAs - self.ndim = self._check_ndim(values, ndim) + self.ndim = ndim self.mgr_locs = placement self.values = self._maybe_coerce_values(values) - if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): - raise ValueError( - f"Wrong number of items passed {len(self.values)}, " - f"placement implies {len(self.mgr_locs)}" - ) - @classmethod def _maybe_coerce_values(cls, values): """ @@ -185,44 +177,6 @@ def _maybe_coerce_values(cls, values): """ return values - def _check_ndim(self, values, ndim): - """ - ndim inference and validation. - - Infers ndim from 'values' if not provided to __init__. - Validates that values.ndim and ndim are consistent if and only if - the class variable '_validate_ndim' is True. - - Parameters - ---------- - values : array-like - ndim : int or None - - Returns - ------- - ndim : int - - Raises - ------ - ValueError : the number of dimensions do not match - """ - if ndim is None: - ndim = values.ndim - - if self._validate_ndim: - if values.ndim != ndim: - raise ValueError( - "Wrong number of dimensions. " - f"values.ndim != ndim [{values.ndim} != {ndim}]" - ) - elif values.ndim > ndim: - # ExtensionBlock - raise ValueError( - "Wrong number of dimensions. " - f"values.ndim > ndim [{values.ndim} > {ndim}]" - ) - return ndim - @property def _holder(self): """ @@ -372,7 +326,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: new_values = self._slice(slicer) - if self._validate_ndim and new_values.ndim != self.ndim: + if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) @@ -1475,18 +1429,6 @@ def __init__(self, values, placement, ndim: int): This will call continue to call __init__ for the other base classes mixed in with this Mixin. """ - - # Placement must be converted to BlockPlacement so that we can check - # its length - if not isinstance(placement, libinternals.BlockPlacement): - placement = libinternals.BlockPlacement(placement) - - # Maybe infer ndim from placement - if ndim is None: - if len(placement) != 1: - ndim = 1 - else: - ndim = 2 super().__init__(values, placement, ndim=ndim) if self.ndim == 2 and len(self.mgr_locs) != 1: @@ -2283,9 +2225,63 @@ def get_block_type(values, dtype: Optional[Dtype] = None): return cls -def new_block( - values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None -) -> Block: +def new_block(values, placement, *, ndim: int, klass=None) -> Block: + + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + values, _ = extract_pandas_array(values, None, ndim) + check_ndim(values, placement, ndim) + + if klass is None: + klass = get_block_type(values, values.dtype) + + return klass(values, ndim=ndim, placement=placement) + + +def check_ndim(values, placement: BlockPlacement, ndim: int): + """ + ndim inference and validation. + + Validates that values.ndim and ndim are consistent. + Validates that len(values) and len(placement) are consistent. + + Parameters + ---------- + values : array-like + placement : BlockPlacement + ndim : int + + Raises + ------ + ValueError : the number of dimensions do not match + """ + + if values.ndim > ndim: + # Check for both np.ndarray and ExtensionArray + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim > ndim [{values.ndim} > {ndim}]" + ) + + elif isinstance(values.dtype, np.dtype): + # TODO(EA2D): special case not needed with 2D EAs + if values.ndim != ndim: + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) + if len(placement) != len(values): + raise ValueError( + f"Wrong number of items passed {len(values)}, " + f"placement implies {len(placement)}" + ) + elif ndim == 2 and len(placement) != 1: + # TODO(EA2D): special case unnecessary with 2D EAs + raise AssertionError("block.size != values.size") + + +def extract_pandas_array(values, dtype, ndim): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): @@ -2297,16 +2293,7 @@ def new_block( if isinstance(dtype, PandasDtype): dtype = dtype.numpy_dtype - if klass is None: - dtype = dtype or values.dtype - klass = get_block_type(values, dtype) - - elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype): - # TODO: This is no longer hit internally; does it need to be retained - # for e.g. pyarrow? - values = DatetimeArray._simple_new(values, dtype=dtype) - - return klass(values, ndim=ndim, placement=placement) + return values, dtype # ----------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b656c9e83e1a8..faf43039941b5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1610,6 +1610,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) + if array.ndim > blk.values.ndim: + raise ValueError block = blk.make_block_same_class(array, placement=slice(0, len(array))) return type(self)(block, self.index[slobj]) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 7f0fed71ca5f1..880872bfa713a 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -75,7 +75,7 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): msgs = [] if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]: - msgs.append(r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]") + msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]") if frame_or_series is Series or indexer_sli is tm.iloc: msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)") if indexer_sli is tm.loc or ( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 683006d9b3b9c..6681fa958c7ad 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1318,17 +1318,19 @@ def test_make_block_no_pandas_array(block_maker): assert result.dtype.kind in ["i", "u"] assert result.is_extension is False - # PandasArray, PandasDtype - result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) - assert result.dtype.kind in ["i", "u"] - assert result.is_extension is False - - # ndarray, PandasDtype - result = block_maker( - arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim - ) - assert result.dtype.kind in ["i", "u"] - assert result.is_extension is False + if block_maker is make_block: + # PandasArray, PandasDtype + result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim) + assert result.dtype.kind in ["i", "u"] + assert result.is_extension is False + + # new_block no longer taked dtype keyword + # ndarray, PandasDtype + result = block_maker( + arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim + ) + assert result.dtype.kind in ["i", "u"] + assert result.is_extension is False def test_single_block_manager_fastpath_deprecated(): From 23c6e6e0e464c42f32640f3a7ef17e317dff49e5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Mar 2021 15:58:59 -0800 Subject: [PATCH 2/2] docstring, exception message --- pandas/core/internals/blocks.py | 9 +++++++-- pandas/core/internals/managers.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 09bc715850a1e..88c8122f17d34 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,6 +7,7 @@ Callable, List, Optional, + Tuple, Type, Union, cast, @@ -2269,8 +2270,12 @@ def check_ndim(values, placement: BlockPlacement, ndim: int): raise AssertionError("block.size != values.size") -def extract_pandas_array(values, dtype, ndim): - # Ensure that we don't allow PandasArray / PandasDtype in internals. +def extract_pandas_array( + values: ArrayLike, dtype: Optional[DtypeObj], ndim: int +) -> Tuple[ArrayLike, Optional[DtypeObj]]: + """ + Ensure that we don't allow PandasArray / PandasDtype in internals. + """ # For now, blocks should be backed by ndarrays when possible. if isinstance(values, ABCPandasArray): values = values.to_numpy() diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6ee5a4fa551a0..1a2e8693b5097 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1611,7 +1611,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) if array.ndim > blk.values.ndim: - raise ValueError + # This will be caught by Series._get_values + raise ValueError("dimension-expanding indexing not allowed") block = blk.make_block_same_class(array, placement=slice(0, len(array))) return type(self)(block, self.index[slobj])