Skip to content

Commit 126bb92

Browse files
authored
REF: simplify operating-columnwise dispatch (pandas-dev#40256)
1 parent fefd999 commit 126bb92

File tree

1 file changed

+76
-128
lines changed

1 file changed

+76
-128
lines changed

pandas/core/internals/blocks.py

+76-128
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from functools import wraps
34
import re
45
from typing import (
56
TYPE_CHECKING,
@@ -30,6 +31,7 @@
3031
ArrayLike,
3132
Dtype,
3233
DtypeObj,
34+
F,
3335
Shape,
3436
final,
3537
)
@@ -121,6 +123,24 @@
121123
_dtype_obj = np.dtype("object")
122124

123125

126+
def maybe_split(meth: F) -> F:
    """
    Decorator that routes a Block method through column-wise dispatch.

    The wrapped method is called directly when the block is 1-dimensional
    or has a single row along its first axis (``self.shape[0] == 1``).
    In every other case the call is handed to ``self.split_and_operate``,
    which receives the undecorated method together with the caller's
    positional and keyword arguments.
    """

    @wraps(meth)
    def dispatch(self, *args, **kwargs) -> List[Block]:
        # Check ndim first so that ``shape[0]`` is only consulted for
        # blocks that are not 1-dimensional.
        if self.ndim != 1 and self.shape[0] != 1:
            # Multi-column block: split and operate column-by-column.
            return self.split_and_operate(meth, *args, **kwargs)

        # Single-column (or 1D) block: use the original method as-is.
        return meth(self, *args, **kwargs)

    return cast(F, dispatch)
142+
143+
124144
class Block(PandasObject):
125145
"""
126146
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
@@ -464,17 +484,16 @@ def fillna(
464484
# we can't process the value, but nothing to do
465485
return [self] if inplace else [self.copy()]
466486

467-
# operate column-by-column
468-
def f(mask, val, idx):
469-
block = self.coerce_to_target_dtype(value)
470-
471-
# slice out our block
472-
if idx is not None:
473-
# i.e. self.ndim == 2
474-
block = block.getitem_block(slice(idx, idx + 1))
475-
return block.fillna(value, limit=limit, inplace=inplace, downcast=None)
487+
elif self.ndim == 1 or self.shape[0] == 1:
488+
blk = self.coerce_to_target_dtype(value)
489+
# bc we have already cast, inplace=True may avoid an extra copy
490+
return blk.fillna(value, limit=limit, inplace=True, downcast=None)
476491

477-
return self.split_and_operate(None, f, inplace)
492+
else:
493+
# operate column-by-column
494+
return self.split_and_operate(
495+
type(self).fillna, value, limit=limit, inplace=inplace, downcast=None
496+
)
478497

479498
@final
480499
def _split(self) -> List[Block]:
@@ -492,75 +511,27 @@ def _split(self) -> List[Block]:
492511
return new_blocks
493512

494513
@final
495-
def split_and_operate(
496-
self, mask, f, inplace: bool, ignore_failures: bool = False
497-
) -> List[Block]:
514+
def split_and_operate(self, func, *args, **kwargs) -> List[Block]:
498515
"""
499-
split the block per-column, and apply the callable f
500-
per-column, return a new block for each. Handle
501-
masking which will not change a block unless needed.
516+
Split the block and apply func column-by-column.
502517
503518
Parameters
504519
----------
505-
mask : 2-d boolean mask
506-
f : callable accepting (1d-mask, 1d values, indexer)
507-
inplace : bool
508-
ignore_failures : bool, default False
520+
func : Block method
521+
*args
522+
**kwargs
509523
510524
Returns
511525
-------
512-
list of blocks
526+
List[Block]
513527
"""
514-
if mask is None:
515-
mask = np.broadcast_to(True, shape=self.shape)
516-
517-
new_values = self.values
518-
519-
def make_a_block(nv, ref_loc):
520-
if isinstance(nv, list):
521-
assert len(nv) == 1, nv
522-
assert isinstance(nv[0], Block)
523-
block = nv[0]
524-
else:
525-
# Put back the dimension that was taken from it and make
526-
# a block out of the result.
527-
nv = ensure_block_shape(nv, ndim=self.ndim)
528-
block = self.make_block(values=nv, placement=ref_loc)
529-
return block
530-
531-
# ndim == 1
532-
if self.ndim == 1:
533-
if mask.any():
534-
nv = f(mask, new_values, None)
535-
else:
536-
nv = new_values if inplace else new_values.copy()
537-
block = make_a_block(nv, self._mgr_locs)
538-
return [block]
539-
540-
# ndim > 1
541-
new_blocks = []
542-
for i, ref_loc in enumerate(self._mgr_locs):
543-
m = mask[i]
544-
v = new_values[i]
545-
546-
# need a new block
547-
if m.any() or m.size == 0:
548-
# Apply our function; we may ignore_failures if this is a
549-
# reduction that is dropping nuisance columns GH#37827
550-
try:
551-
nv = f(m, v, i)
552-
except TypeError:
553-
if ignore_failures:
554-
continue
555-
else:
556-
raise
557-
else:
558-
nv = v if inplace else v.copy()
559-
560-
block = make_a_block(nv, [ref_loc])
561-
new_blocks.append(block)
528+
assert self.ndim == 2 and self.shape[0] != 1
562529

563-
return new_blocks
530+
res_blocks = []
531+
for nb in self._split():
532+
rbs = func(nb, *args, **kwargs)
533+
res_blocks.extend(rbs)
534+
return res_blocks
564535

565536
def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]:
566537

@@ -600,13 +571,17 @@ def downcast(self, dtypes=None) -> List[Block]:
600571
elif dtypes != "infer":
601572
raise AssertionError("dtypes as dict is not supported yet")
602573

603-
# operate column-by-column
604-
# this is expensive as it splits the blocks items-by-item
605-
def f(mask, val, idx):
606-
val = maybe_downcast_to_dtype(val, dtype="infer")
607-
return val
574+
return self._downcast_2d()
608575

609-
return self.split_and_operate(None, f, False)
576+
@maybe_split
def _downcast_2d(self) -> List[Block]:
    """
    Downcast this block's values with ``dtype="infer"``.

    This is the post-validation half of ``downcast``; it was split out
    so that it can be decorated with ``maybe_split``.

    Returns
    -------
    List[Block]
        A single-element list holding the block built from the
        downcast values.
    """
    downcasted = maybe_downcast_to_dtype(self.values, dtype="infer")
    result = self.make_block(downcasted)
    return [result]
610585

611586
@final
612587
def astype(self, dtype, copy: bool = False, errors: str = "raise"):
@@ -735,18 +710,13 @@ def replace(
735710
# bc _can_hold_element is incorrect.
736711
return [self] if inplace else [self.copy()]
737712

738-
if not self._can_hold_element(value):
739-
if self.ndim == 2 and self.shape[0] > 1:
740-
# split so that we only upcast where necessary
741-
nbs = self._split()
742-
res_blocks = extend_blocks(
743-
[
744-
blk.replace(to_replace, value, inplace=inplace, regex=regex)
745-
for blk in nbs
746-
]
747-
)
748-
return res_blocks
713+
elif self._can_hold_element(value):
714+
blk = self if inplace else self.copy()
715+
putmask_inplace(blk.values, mask, value)
716+
blocks = blk.convert(numeric=False, copy=False)
717+
return blocks
749718

719+
elif self.ndim == 1 or self.shape[0] == 1:
750720
blk = self.coerce_to_target_dtype(value)
751721
return blk.replace(
752722
to_replace=to_replace,
@@ -755,10 +725,11 @@ def replace(
755725
regex=regex,
756726
)
757727

758-
blk = self if inplace else self.copy()
759-
putmask_inplace(blk.values, mask, value)
760-
blocks = blk.convert(numeric=False, copy=False)
761-
return blocks
728+
else:
729+
# split so that we only upcast where necessary
730+
return self.split_and_operate(
731+
type(self).replace, to_replace, value, inplace=inplace, regex=regex
732+
)
762733

763734
@final
764735
def _replace_regex(
@@ -2048,6 +2019,8 @@ class ObjectBlock(Block):
20482019
is_object = True
20492020
_can_hold_na = True
20502021

2022+
values: np.ndarray
2023+
20512024
@property
20522025
def is_bool(self):
20532026
"""
@@ -2056,26 +2029,15 @@ def is_bool(self):
20562029
"""
20572030
return lib.is_bool_array(self.values.ravel("K"))
20582031

2032+
@maybe_split
20592033
def reduce(self, func, ignore_failures: bool = False) -> List[Block]:
20602034
"""
20612035
For object-dtype, we operate column-wise.
20622036
"""
20632037
assert self.ndim == 2
20642038

2065-
values = self.values
2066-
if len(values) > 1:
2067-
# split_and_operate expects func with signature (mask, values, inplace)
2068-
def mask_func(mask, values, inplace):
2069-
if values.ndim == 1:
2070-
values = values.reshape(1, -1)
2071-
return func(values)
2072-
2073-
return self.split_and_operate(
2074-
None, mask_func, False, ignore_failures=ignore_failures
2075-
)
2076-
20772039
try:
2078-
res = func(values)
2040+
res = func(self.values)
20792041
except TypeError:
20802042
if not ignore_failures:
20812043
raise
@@ -2086,6 +2048,7 @@ def mask_func(mask, values, inplace):
20862048
res = res.reshape(1, -1)
20872049
return [self.make_block_same_class(res)]
20882050

2051+
@maybe_split
20892052
def convert(
20902053
self,
20912054
copy: bool = True,
@@ -2097,30 +2060,15 @@ def convert(
20972060
attempt to cast any object types to better types return a copy of
20982061
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
20992062
"""
2100-
2101-
# operate column-by-column
2102-
def f(mask, val, idx):
2103-
shape = val.shape
2104-
values = soft_convert_objects(
2105-
val.ravel(),
2106-
datetime=datetime,
2107-
numeric=numeric,
2108-
timedelta=timedelta,
2109-
copy=copy,
2110-
)
2111-
if isinstance(values, np.ndarray):
2112-
# TODO(EA2D): allow EA once reshape is supported
2113-
values = values.reshape(shape)
2114-
2115-
return values
2116-
2117-
if self.ndim == 2:
2118-
blocks = self.split_and_operate(None, f, False)
2119-
else:
2120-
values = f(None, self.values.ravel(), None)
2121-
blocks = [self.make_block(values)]
2122-
2123-
return blocks
2063+
res_values = soft_convert_objects(
2064+
self.values.ravel(),
2065+
datetime=datetime,
2066+
numeric=numeric,
2067+
timedelta=timedelta,
2068+
copy=copy,
2069+
)
2070+
res_values = ensure_block_shape(res_values, self.ndim)
2071+
return [self.make_block(res_values)]
21242072

21252073
def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]:
21262074

0 commit comments

Comments
 (0)