Skip to content

PERF: do ndim validation before Block.__init__ #40337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 14 additions & 16 deletions pandas/core/internals/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
from pandas._typing import Dtype

from pandas.core.dtypes.common import is_datetime64tz_dtype
from pandas.core.dtypes.dtypes import PandasDtype
from pandas.core.dtypes.generic import ABCPandasArray

from pandas.core.arrays import DatetimeArray
from pandas.core.internals.blocks import (
Block,
DatetimeTZBlock,
check_ndim,
extract_pandas_array,
get_block_type,
)

Expand All @@ -39,29 +39,28 @@ def make_block(
- Block.make_block_same_class
- Block.__init__
"""
if isinstance(values, ABCPandasArray):
# Ensure that we don't allow PandasArray / PandasDtype in internals.
# For now, blocks should be backed by ndarrays when possible.
values = values.to_numpy()
if ndim and ndim > 1:
# TODO(EA2D): special case not needed with 2D EAs
values = np.atleast_2d(values)

if isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype
values, dtype = extract_pandas_array(values, dtype, ndim)

if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(values, dtype)

elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
# TODO: This is no longer hit internally; does it need to be retained
# for e.g. pyarrow?
# pyarrow calls get here
values = DatetimeArray._simple_new(values, dtype=dtype)

if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)

ndim = _maybe_infer_ndim(values, placement, ndim)
check_ndim(values, placement, ndim)
return klass(values, ndim=ndim, placement=placement)


def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int:
"""
If `ndim` is not provided, infer it from placment and values.
"""
if ndim is None:
# GH#38134 Block constructor now assumes ndim is not None
if not isinstance(values.dtype, np.dtype):
Expand All @@ -71,5 +70,4 @@ def make_block(
ndim = 2
else:
ndim = values.ndim

return klass(values, ndim=ndim, placement=placement)
return ndim
131 changes: 66 additions & 65 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Callable,
List,
Optional,
Tuple,
Type,
Union,
cast,
Expand Down Expand Up @@ -47,7 +48,6 @@
)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_datetime64tz_dtype,
is_dtype_equal,
is_extension_array_dtype,
is_list_like,
Expand Down Expand Up @@ -162,21 +162,10 @@ def __init__(self, values, placement, ndim: int):
ndim : int
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
"""
# TODO(EA2D): ndim will be unnecessary with 2D EAs
self.ndim = self._check_ndim(values, ndim)
self.ndim = ndim
self.mgr_locs = placement
self.values = self._maybe_coerce_values(values)

if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
raise ValueError(
f"Wrong number of items passed {len(self.values)}, "
f"placement implies {len(self.mgr_locs)}"
)

elif self.is_extension and self.ndim == 2 and len(self.mgr_locs) != 1:
# TODO(EA2D): check unnecessary with 2D EAs
raise AssertionError("block.size != values.size")

@classmethod
def _maybe_coerce_values(cls, values):
"""
Expand All @@ -192,43 +181,6 @@ def _maybe_coerce_values(cls, values):
"""
return values

def _check_ndim(self, values, ndim: int):
"""
ndim inference and validation.

Infers ndim from 'values' if not provided to __init__.
Validates that values.ndim and ndim are consistent if and only if
the class variable '_validate_ndim' is True.

Parameters
----------
values : array-like
ndim : int

Returns
-------
ndim : int

Raises
------
ValueError : the number of dimensions do not match
"""
assert isinstance(ndim, int) # GH#38134 enforce this

if self._validate_ndim:
if values.ndim != ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
elif values.ndim > ndim:
# ExtensionBlock
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim > ndim [{values.ndim} > {ndim}]"
)
return ndim

@property
def _holder(self):
"""
Expand Down Expand Up @@ -378,7 +330,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block:

new_values = self._slice(slicer)

if self._validate_ndim and new_values.ndim != self.ndim:
if new_values.ndim != self.values.ndim:
raise ValueError("Only same dim slicing is allowed")

return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)
Expand Down Expand Up @@ -2262,10 +2214,68 @@ def get_block_type(values, dtype: Optional[Dtype] = None):
return cls


def new_block(
values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None
) -> Block:
# Ensure that we don't allow PandasArray / PandasDtype in internals.
def new_block(values, placement, *, ndim: int, klass=None) -> Block:

if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)

values, _ = extract_pandas_array(values, None, ndim)
check_ndim(values, placement, ndim)

if klass is None:
klass = get_block_type(values, values.dtype)

return klass(values, ndim=ndim, placement=placement)


def check_ndim(values, placement: BlockPlacement, ndim: int):
"""
ndim inference and validation.

Validates that values.ndim and ndim are consistent.
Validates that len(values) and len(placement) are consistent.

Parameters
----------
values : array-like
placement : BlockPlacement
ndim : int

Raises
------
ValueError : the number of dimensions do not match
"""

if values.ndim > ndim:
# Check for both np.ndarray and ExtensionArray
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim > ndim [{values.ndim} > {ndim}]"
)

elif isinstance(values.dtype, np.dtype):
# TODO(EA2D): special case not needed with 2D EAs
if values.ndim != ndim:
raise ValueError(
"Wrong number of dimensions. "
f"values.ndim != ndim [{values.ndim} != {ndim}]"
)
if len(placement) != len(values):
raise ValueError(
f"Wrong number of items passed {len(values)}, "
f"placement implies {len(placement)}"
)
elif ndim == 2 and len(placement) != 1:
# TODO(EA2D): special case unnecessary with 2D EAs
raise AssertionError("block.size != values.size")


def extract_pandas_array(
values: ArrayLike, dtype: Optional[DtypeObj], ndim: int
) -> Tuple[ArrayLike, Optional[DtypeObj]]:
"""
Ensure that we don't allow PandasArray / PandasDtype in internals.
"""
# For now, blocks should be backed by ndarrays when possible.
if isinstance(values, ABCPandasArray):
values = values.to_numpy()
Expand All @@ -2276,16 +2286,7 @@ def new_block(
if isinstance(dtype, PandasDtype):
dtype = dtype.numpy_dtype

if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(values, dtype)

elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
# TODO: This is no longer hit internally; does it need to be retained
# for e.g. pyarrow?
values = DatetimeArray._simple_new(values, dtype=dtype)

return klass(values, ndim=ndim, placement=placement)
return values, dtype


# -----------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1610,6 +1610,9 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:

blk = self._block
array = blk._slice(slobj)
if array.ndim > blk.values.ndim:
# This will be caught by Series._get_values
raise ValueError("dimension-expanding indexing not allowed")
block = blk.make_block_same_class(array, placement=slice(0, len(array)))
return type(self)(block, self.index[slobj])

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli):

msgs = []
if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]:
msgs.append(r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]")
msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]")
if frame_or_series is Series or indexer_sli is tm.iloc:
msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)")
if indexer_sli is tm.loc or (
Expand Down
24 changes: 13 additions & 11 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,17 +1318,19 @@ def test_make_block_no_pandas_array(block_maker):
assert result.dtype.kind in ["i", "u"]
assert result.is_extension is False

# PandasArray, PandasDtype
result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
assert result.dtype.kind in ["i", "u"]
assert result.is_extension is False

# ndarray, PandasDtype
result = block_maker(
arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
)
assert result.dtype.kind in ["i", "u"]
assert result.is_extension is False
if block_maker is make_block:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess could have the else path here as well to assert the public api?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the pseudo-public api

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right what i mean is what's the else path? (an internal path)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct. the internal path no longer takes the dtype kwd checked below

# PandasArray, PandasDtype
result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
assert result.dtype.kind in ["i", "u"]
assert result.is_extension is False

# new_block no longer taked dtype keyword
# ndarray, PandasDtype
result = block_maker(
arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
)
assert result.dtype.kind in ["i", "u"]
assert result.is_extension is False


def test_single_block_manager_fastpath_deprecated():
Expand Down