Skip to content

Commit 1f5e358

Browse files
authored
PERF: do ndim validation before Block.__init__ (#40337)
1 parent ac3e3f7 commit 1f5e358

File tree

5 files changed

+97
-93
lines changed

5 files changed

+97
-93
lines changed

pandas/core/internals/api.py

+14-16
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414
from pandas._typing import Dtype
1515

1616
from pandas.core.dtypes.common import is_datetime64tz_dtype
17-
from pandas.core.dtypes.dtypes import PandasDtype
18-
from pandas.core.dtypes.generic import ABCPandasArray
1917

2018
from pandas.core.arrays import DatetimeArray
2119
from pandas.core.internals.blocks import (
2220
Block,
2321
DatetimeTZBlock,
22+
check_ndim,
23+
extract_pandas_array,
2424
get_block_type,
2525
)
2626

@@ -39,29 +39,28 @@ def make_block(
3939
- Block.make_block_same_class
4040
- Block.__init__
4141
"""
42-
if isinstance(values, ABCPandasArray):
43-
# Ensure that we don't allow PandasArray / PandasDtype in internals.
44-
# For now, blocks should be backed by ndarrays when possible.
45-
values = values.to_numpy()
46-
if ndim and ndim > 1:
47-
# TODO(EA2D): special case not needed with 2D EAs
48-
values = np.atleast_2d(values)
49-
50-
if isinstance(dtype, PandasDtype):
51-
dtype = dtype.numpy_dtype
42+
values, dtype = extract_pandas_array(values, dtype, ndim)
5243

5344
if klass is None:
5445
dtype = dtype or values.dtype
5546
klass = get_block_type(values, dtype)
5647

5748
elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
58-
# TODO: This is no longer hit internally; does it need to be retained
59-
# for e.g. pyarrow?
49+
# pyarrow calls get here
6050
values = DatetimeArray._simple_new(values, dtype=dtype)
6151

6252
if not isinstance(placement, BlockPlacement):
6353
placement = BlockPlacement(placement)
6454

55+
ndim = _maybe_infer_ndim(values, placement, ndim)
56+
check_ndim(values, placement, ndim)
57+
return klass(values, ndim=ndim, placement=placement)
58+
59+
60+
def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int:
61+
"""
62+
If `ndim` is not provided, infer it from placement and values.
63+
"""
6564
if ndim is None:
6665
# GH#38134 Block constructor now assumes ndim is not None
6766
if not isinstance(values.dtype, np.dtype):
@@ -71,5 +70,4 @@ def make_block(
7170
ndim = 2
7271
else:
7372
ndim = values.ndim
74-
75-
return klass(values, ndim=ndim, placement=placement)
73+
return ndim

pandas/core/internals/blocks.py

+66-65
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Callable,
88
List,
99
Optional,
10+
Tuple,
1011
Type,
1112
Union,
1213
cast,
@@ -47,7 +48,6 @@
4748
)
4849
from pandas.core.dtypes.common import (
4950
is_categorical_dtype,
50-
is_datetime64tz_dtype,
5151
is_dtype_equal,
5252
is_extension_array_dtype,
5353
is_list_like,
@@ -164,21 +164,10 @@ def __init__(self, values, placement, ndim: int):
164164
ndim : int
165165
1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
166166
"""
167-
# TODO(EA2D): ndim will be unnecessary with 2D EAs
168-
self.ndim = self._check_ndim(values, ndim)
167+
self.ndim = ndim
169168
self.mgr_locs = placement
170169
self.values = self._maybe_coerce_values(values)
171170

172-
if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
173-
raise ValueError(
174-
f"Wrong number of items passed {len(self.values)}, "
175-
f"placement implies {len(self.mgr_locs)}"
176-
)
177-
178-
elif self.is_extension and self.ndim == 2 and len(self.mgr_locs) != 1:
179-
# TODO(EA2D): check unnecessary with 2D EAs
180-
raise AssertionError("block.size != values.size")
181-
182171
@classmethod
183172
def _maybe_coerce_values(cls, values):
184173
"""
@@ -194,43 +183,6 @@ def _maybe_coerce_values(cls, values):
194183
"""
195184
return values
196185

197-
def _check_ndim(self, values, ndim: int):
198-
"""
199-
ndim inference and validation.
200-
201-
Infers ndim from 'values' if not provided to __init__.
202-
Validates that values.ndim and ndim are consistent if and only if
203-
the class variable '_validate_ndim' is True.
204-
205-
Parameters
206-
----------
207-
values : array-like
208-
ndim : int
209-
210-
Returns
211-
-------
212-
ndim : int
213-
214-
Raises
215-
------
216-
ValueError : the number of dimensions do not match
217-
"""
218-
assert isinstance(ndim, int) # GH#38134 enforce this
219-
220-
if self._validate_ndim:
221-
if values.ndim != ndim:
222-
raise ValueError(
223-
"Wrong number of dimensions. "
224-
f"values.ndim != ndim [{values.ndim} != {ndim}]"
225-
)
226-
elif values.ndim > ndim:
227-
# ExtensionBlock
228-
raise ValueError(
229-
"Wrong number of dimensions. "
230-
f"values.ndim > ndim [{values.ndim} > {ndim}]"
231-
)
232-
return ndim
233-
234186
@property
235187
def _holder(self):
236188
"""
@@ -384,7 +336,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block:
384336

385337
new_values = self._slice(slicer)
386338

387-
if self._validate_ndim and new_values.ndim != self.ndim:
339+
if new_values.ndim != self.values.ndim:
388340
raise ValueError("Only same dim slicing is allowed")
389341

390342
return type(self)._simple_new(new_values, new_mgr_locs, self.ndim)
@@ -2337,10 +2289,68 @@ def get_block_type(values, dtype: Optional[Dtype] = None):
23372289
return cls
23382290

23392291

2340-
def new_block(
2341-
values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None
2342-
) -> Block:
2343-
# Ensure that we don't allow PandasArray / PandasDtype in internals.
2292+
def new_block(values, placement, *, ndim: int, klass=None) -> Block:
2293+
2294+
if not isinstance(placement, BlockPlacement):
2295+
placement = BlockPlacement(placement)
2296+
2297+
values, _ = extract_pandas_array(values, None, ndim)
2298+
check_ndim(values, placement, ndim)
2299+
2300+
if klass is None:
2301+
klass = get_block_type(values, values.dtype)
2302+
2303+
return klass(values, ndim=ndim, placement=placement)
2304+
2305+
2306+
def check_ndim(values, placement: BlockPlacement, ndim: int):
2307+
"""
2308+
ndim validation.
2309+
2310+
Validates that values.ndim and ndim are consistent.
2311+
Validates that len(values) and len(placement) are consistent.
2312+
2313+
Parameters
2314+
----------
2315+
values : array-like
2316+
placement : BlockPlacement
2317+
ndim : int
2318+
2319+
Raises
2320+
------
2321+
ValueError : the number of dimensions do not match
2322+
"""
2323+
2324+
if values.ndim > ndim:
2325+
# Check for both np.ndarray and ExtensionArray
2326+
raise ValueError(
2327+
"Wrong number of dimensions. "
2328+
f"values.ndim > ndim [{values.ndim} > {ndim}]"
2329+
)
2330+
2331+
elif isinstance(values.dtype, np.dtype):
2332+
# TODO(EA2D): special case not needed with 2D EAs
2333+
if values.ndim != ndim:
2334+
raise ValueError(
2335+
"Wrong number of dimensions. "
2336+
f"values.ndim != ndim [{values.ndim} != {ndim}]"
2337+
)
2338+
if len(placement) != len(values):
2339+
raise ValueError(
2340+
f"Wrong number of items passed {len(values)}, "
2341+
f"placement implies {len(placement)}"
2342+
)
2343+
elif ndim == 2 and len(placement) != 1:
2344+
# TODO(EA2D): special case unnecessary with 2D EAs
2345+
raise AssertionError("block.size != values.size")
2346+
2347+
2348+
def extract_pandas_array(
2349+
values: ArrayLike, dtype: Optional[DtypeObj], ndim: int
2350+
) -> Tuple[ArrayLike, Optional[DtypeObj]]:
2351+
"""
2352+
Ensure that we don't allow PandasArray / PandasDtype in internals.
2353+
"""
23442354
# For now, blocks should be backed by ndarrays when possible.
23452355
if isinstance(values, ABCPandasArray):
23462356
values = values.to_numpy()
@@ -2351,16 +2361,7 @@ def new_block(
23512361
if isinstance(dtype, PandasDtype):
23522362
dtype = dtype.numpy_dtype
23532363

2354-
if klass is None:
2355-
dtype = dtype or values.dtype
2356-
klass = get_block_type(values, dtype)
2357-
2358-
elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
2359-
# TODO: This is no longer hit internally; does it need to be retained
2360-
# for e.g. pyarrow?
2361-
values = DatetimeArray._simple_new(values, dtype=dtype)
2362-
2363-
return klass(values, ndim=ndim, placement=placement)
2364+
return values, dtype
23642365

23652366

23662367
# -----------------------------------------------------------------

pandas/core/internals/managers.py

+3
Original file line numberDiff line numberDiff line change
@@ -1669,6 +1669,9 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager:
16691669

16701670
blk = self._block
16711671
array = blk._slice(slobj)
1672+
if array.ndim > blk.values.ndim:
1673+
# This will be caught by Series._get_values
1674+
raise ValueError("dimension-expanding indexing not allowed")
16721675
block = blk.make_block_same_class(array, placement=slice(0, len(array)))
16731676
new_index = self.index._getitem_slice(slobj)
16741677
return type(self)(block, new_index)

pandas/tests/indexing/test_indexing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
7575

7676
msgs = []
7777
if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]:
78-
msgs.append(r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]")
78+
msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]")
7979
if frame_or_series is Series or indexer_sli is tm.iloc:
8080
msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)")
8181
if indexer_sli is tm.loc or (

pandas/tests/internals/test_internals.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -1327,17 +1327,19 @@ def test_make_block_no_pandas_array(block_maker):
13271327
assert result.dtype.kind in ["i", "u"]
13281328
assert result.is_extension is False
13291329

1330-
# PandasArray, PandasDtype
1331-
result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
1332-
assert result.dtype.kind in ["i", "u"]
1333-
assert result.is_extension is False
1334-
1335-
# ndarray, PandasDtype
1336-
result = block_maker(
1337-
arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
1338-
)
1339-
assert result.dtype.kind in ["i", "u"]
1340-
assert result.is_extension is False
1330+
if block_maker is make_block:
1331+
# PandasArray, PandasDtype
1332+
result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
1333+
assert result.dtype.kind in ["i", "u"]
1334+
assert result.is_extension is False
1335+
1336+
# new_block no longer takes dtype keyword
1337+
# ndarray, PandasDtype
1338+
result = block_maker(
1339+
arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
1340+
)
1341+
assert result.dtype.kind in ["i", "u"]
1342+
assert result.is_extension is False
13411343

13421344

13431345
def test_single_block_manager_fastpath_deprecated():

0 commit comments

Comments
 (0)