Skip to content

Commit 15fd7d7

Browse files
authored
PERF: new_block (#52326)
* PERF: new_block
* fix arraymanager build
1 parent 523ab8c commit 15fd7d7

File tree

5 files changed

+63
-41
lines changed

5 files changed

+63
-41
lines changed

pandas/_libs/internals.pyx

+13-3
Original file line number | Diff line number | Diff line change
@@ -569,8 +569,14 @@ cpdef update_blklocs_and_blknos(
569569
def _unpickle_block(values, placement, ndim):
570570
# We have to do some gymnastics b/c "ndim" is keyword-only
571571

572-
from pandas.core.internals.blocks import new_block
573-
572+
from pandas.core.internals.blocks import (
573+
maybe_coerce_values,
574+
new_block,
575+
)
576+
values = maybe_coerce_values(values)
577+
578+
if not isinstance(placement, BlockPlacement):
579+
placement = BlockPlacement(placement)
574580
return new_block(values, placement, ndim=ndim)
575581

576582

@@ -795,6 +801,7 @@ cdef class BlockManager:
795801
from pandas.core.construction import extract_array
796802
from pandas.core.internals.blocks import (
797803
ensure_block_shape,
804+
maybe_coerce_values,
798805
new_block,
799806
)
800807
from pandas.core.internals.managers import ensure_index
@@ -808,7 +815,10 @@ cdef class BlockManager:
808815
vals = blk["values"]
809816
# older versions may hold e.g. DatetimeIndex instead of DTA
810817
vals = extract_array(vals, extract_numpy=True)
811-
blk["values"] = ensure_block_shape(vals, ndim=ndim)
818+
blk["values"] = maybe_coerce_values(ensure_block_shape(vals, ndim=ndim))
819+
820+
if not isinstance(blk["mgr_locs"], BlockPlacement):
821+
blk["mgr_locs"] = BlockPlacement(blk["mgr_locs"])
812822

813823
nbs = [
814824
new_block(blk["values"], blk["mgr_locs"], ndim=ndim)

pandas/core/internals/array_manager.py

+6-2
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,7 @@
7878
interleaved_dtype,
7979
)
8080
from pandas.core.internals.blocks import (
81+
BlockPlacement,
8182
ensure_block_shape,
8283
external_values,
8384
extract_pandas_array,
@@ -290,11 +291,14 @@ def apply_with_block(
290291
# convert for the Block constructors.
291292
arr = np.asarray(arr)
292293

294+
arr = maybe_coerce_values(arr)
293295
if self.ndim == 2:
294296
arr = ensure_block_shape(arr, 2)
295-
block = new_block(arr, placement=slice(0, 1, 1), ndim=2)
297+
bp = BlockPlacement(slice(0, 1, 1))
298+
block = new_block(arr, placement=bp, ndim=2)
296299
else:
297-
block = new_block(arr, placement=slice(0, len(self), 1), ndim=1)
300+
bp = BlockPlacement(slice(0, len(self), 1))
301+
block = new_block(arr, placement=bp, ndim=1)
298302

299303
applied = getattr(block, f)(**kwargs)
300304
if isinstance(applied, list):

pandas/core/internals/blocks.py

+18-14
Original file line number | Diff line number | Diff line change
@@ -211,7 +211,10 @@ def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
211211

212212
@final
213213
def make_block(
214-
self, values, placement=None, refs: BlockValuesRefs | None = None
214+
self,
215+
values,
216+
placement: BlockPlacement | None = None,
217+
refs: BlockValuesRefs | None = None,
215218
) -> Block:
216219
"""
217220
Create a new block, with type inference propagate any values that are
@@ -222,8 +225,6 @@ def make_block(
222225
if self.is_extension:
223226
values = ensure_block_shape(values, ndim=self.ndim)
224227

225-
# TODO: perf by not going through new_block
226-
# We assume maybe_coerce_values has already been called
227228
return new_block(values, placement=placement, ndim=self.ndim, refs=refs)
228229

229230
@final
@@ -327,6 +328,7 @@ def apply(self, func, **kwargs) -> list[Block]:
327328
"""
328329
result = func(self.values, **kwargs)
329330

331+
result = maybe_coerce_values(result)
330332
return self._split_op_result(result)
331333

332334
@final
@@ -359,7 +361,8 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]:
359361
else:
360362
vals = result[i]
361363

362-
block = self.make_block(values=vals, placement=loc)
364+
bp = BlockPlacement(loc)
365+
block = self.make_block(values=vals, placement=bp)
363366
nbs.append(block)
364367
return nbs
365368

@@ -454,6 +457,7 @@ def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]:
454457
Refactored to allow use of maybe_split.
455458
"""
456459
new_values = maybe_downcast_to_dtype(self.values, dtype=dtype)
460+
new_values = maybe_coerce_values(new_values)
457461
refs = self.refs if using_cow and new_values is self.values else None
458462
return [self.make_block(new_values, refs=refs)]
459463

@@ -2292,6 +2296,7 @@ def convert(
22922296
refs = self.refs
22932297

22942298
res_values = ensure_block_shape(res_values, self.ndim)
2299+
res_values = maybe_coerce_values(res_values)
22952300
return [self.make_block(res_values, refs=refs)]
22962301

22972302

@@ -2373,18 +2378,17 @@ def new_block_2d(
23732378

23742379

23752380
def new_block(
2376-
values, placement, *, ndim: int, refs: BlockValuesRefs | None = None
2381+
values,
2382+
placement: BlockPlacement,
2383+
*,
2384+
ndim: int,
2385+
refs: BlockValuesRefs | None = None,
23772386
) -> Block:
2378-
# caller is responsible for ensuring values is NOT a PandasArray
2379-
2380-
if not isinstance(placement, BlockPlacement):
2381-
placement = BlockPlacement(placement)
2382-
2383-
check_ndim(values, placement, ndim)
2384-
2387+
# caller is responsible for ensuring:
2388+
# - values is NOT a PandasArray
2389+
# - check_ndim/ensure_block_shape already checked
2390+
# - maybe_coerce_values already called/unnecessary
23852391
klass = get_block_type(values.dtype)
2386-
2387-
values = maybe_coerce_values(values)
23882392
return klass(values, ndim=ndim, placement=placement, refs=refs)
23892393

23902394

pandas/core/internals/managers.py

+12-3
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@
7272
ensure_block_shape,
7373
extend_blocks,
7474
get_block_type,
75+
maybe_coerce_values,
7576
new_block,
7677
new_block_2d,
7778
)
@@ -1033,9 +1034,10 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10331034
# is this ruled out in the general case?
10341035
result = self.blocks[0].iget((slice(None), loc))
10351036
# in the case of a single block, the new block is a view
1037+
bp = BlockPlacement(slice(0, len(result)))
10361038
block = new_block(
10371039
result,
1038-
placement=slice(0, len(result)),
1040+
placement=bp,
10391041
ndim=1,
10401042
refs=self.blocks[0].refs,
10411043
)
@@ -1070,7 +1072,8 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
10701072
dtype = cast(ExtensionDtype, dtype)
10711073
result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
10721074

1073-
block = new_block(result, placement=slice(0, len(result)), ndim=1)
1075+
bp = BlockPlacement(slice(0, len(result)))
1076+
block = new_block(result, placement=bp, ndim=1)
10741077
return SingleBlockManager(block, self.axes[0])
10751078

10761079
def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
@@ -1886,7 +1889,9 @@ def from_array(
18861889
"""
18871890
Constructor for if we have an array that is not yet a Block.
18881891
"""
1889-
block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
1892+
array = maybe_coerce_values(array)
1893+
bp = BlockPlacement(slice(0, len(index)))
1894+
block = new_block(array, placement=bp, ndim=1, refs=refs)
18901895
return cls(block, index)
18911896

18921897
def to_2d_mgr(self, columns: Index) -> BlockManager:
@@ -1932,6 +1937,10 @@ def unpickle_block(values, mgr_locs, ndim: int) -> Block:
19321937
# TODO(EA2D): ndim would be unnecessary with 2D EAs
19331938
# older pickles may store e.g. DatetimeIndex instead of DatetimeArray
19341939
values = extract_array(values, extract_numpy=True)
1940+
if not isinstance(mgr_locs, BlockPlacement):
1941+
mgr_locs = BlockPlacement(mgr_locs)
1942+
1943+
values = maybe_coerce_values(values)
19351944
return new_block(values, placement=mgr_locs, ndim=ndim)
19361945

19371946
if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:

pandas/tests/internals/test_internals.py

+14-19
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@
4040
)
4141
from pandas.core.internals.blocks import (
4242
ensure_block_shape,
43+
maybe_coerce_values,
4344
new_block,
4445
)
4546

@@ -168,6 +169,7 @@ def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_bl
168169
else:
169170
raise ValueError(f'Unsupported typestr: "{typestr}"')
170171

172+
values = maybe_coerce_values(values)
171173
return maker(values, placement=placement, ndim=len(shape))
172174

173175

@@ -349,7 +351,7 @@ def test_delete_datetimelike(self):
349351
def test_split(self):
350352
# GH#37799
351353
values = np.random.randn(3, 4)
352-
blk = new_block(values, placement=[3, 1, 6], ndim=2)
354+
blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2)
353355
result = blk._split()
354356

355357
# check that we get views, not copies
@@ -358,9 +360,9 @@ def test_split(self):
358360

359361
assert len(result) == 3
360362
expected = [
361-
new_block(values[[0]], placement=[3], ndim=2),
362-
new_block(values[[1]], placement=[1], ndim=2),
363-
new_block(values[[2]], placement=[6], ndim=2),
363+
new_block(values[[0]], placement=BlockPlacement([3]), ndim=2),
364+
new_block(values[[1]], placement=BlockPlacement([1]), ndim=2),
365+
new_block(values[[2]], placement=BlockPlacement([6]), ndim=2),
364366
]
365367
for res, exp in zip(result, expected):
366368
assert_block_equal(res, exp)
@@ -425,7 +427,7 @@ def test_iget(self):
425427
values = np.random.rand(3, 3)
426428
block = new_block(
427429
values=values.copy(),
428-
placement=np.arange(3, dtype=np.intp),
430+
placement=BlockPlacement(np.arange(3, dtype=np.intp)),
429431
ndim=values.ndim,
430432
)
431433
mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))])
@@ -629,13 +631,6 @@ def _compare(old_mgr, new_mgr):
629631
assert new_mgr.iget(7).dtype == np.float64
630632
assert new_mgr.iget(8).dtype == np.float16
631633

632-
def test_invalid_ea_block(self):
633-
with pytest.raises(ValueError, match="need to split"):
634-
create_mgr("a: category; b: category")
635-
636-
with pytest.raises(ValueError, match="need to split"):
637-
create_mgr("a: category2; b: category2")
638-
639634
def test_interleave(self):
640635
# self
641636
for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]:
@@ -1301,7 +1296,7 @@ def test_datetime_block_can_hold_element(self):
13011296
def test_interval_can_hold_element_emptylist(self, dtype, element):
13021297
arr = np.array([1, 3, 4], dtype=dtype)
13031298
ii = IntervalIndex.from_breaks(arr)
1304-
blk = new_block(ii._data, [1], ndim=2)
1299+
blk = new_block(ii._data, BlockPlacement([1]), ndim=2)
13051300

13061301
assert blk._can_hold_element([])
13071302
# TODO: check this holds for all blocks
@@ -1310,7 +1305,7 @@ def test_interval_can_hold_element_emptylist(self, dtype, element):
13101305
def test_interval_can_hold_element(self, dtype, element):
13111306
arr = np.array([1, 3, 4, 9], dtype=dtype)
13121307
ii = IntervalIndex.from_breaks(arr)
1313-
blk = new_block(ii._data, [1], ndim=2)
1308+
blk = new_block(ii._data, BlockPlacement([1]), ndim=2)
13141309

13151310
elem = element(ii)
13161311
self.check_series_setitem(elem, ii, True)
@@ -1335,7 +1330,7 @@ def test_interval_can_hold_element(self, dtype, element):
13351330

13361331
def test_period_can_hold_element_emptylist(self):
13371332
pi = period_range("2016", periods=3, freq="A")
1338-
blk = new_block(pi._data.reshape(1, 3), [1], ndim=2)
1333+
blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2)
13391334

13401335
assert blk._can_hold_element([])
13411336

@@ -1396,13 +1391,13 @@ def test_should_store_categorical(self):
13961391
assert not blk.should_store(np.asarray(cat))
13971392

13981393

1399-
def test_validate_ndim(block_maker):
1394+
def test_validate_ndim():
14001395
values = np.array([1.0, 2.0])
1401-
placement = slice(2)
1396+
placement = BlockPlacement(slice(2))
14021397
msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"
14031398

14041399
with pytest.raises(ValueError, match=msg):
1405-
block_maker(values, placement, ndim=2)
1400+
make_block(values, placement, ndim=2)
14061401

14071402

14081403
def test_block_shape():
@@ -1418,7 +1413,7 @@ def test_make_block_no_pandas_array(block_maker):
14181413
arr = pd.arrays.PandasArray(np.array([1, 2]))
14191414

14201415
# PandasArray, no dtype
1421-
result = block_maker(arr, slice(len(arr)), ndim=arr.ndim)
1416+
result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim)
14221417
assert result.dtype.kind in ["i", "u"]
14231418

14241419
if block_maker is make_block:

0 commit comments

Comments
 (0)