Skip to content

REF: simplify operating-columnwise dispatch #40256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 16, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 76 additions & 128 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from functools import wraps
import re
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -30,6 +31,7 @@
ArrayLike,
Dtype,
DtypeObj,
F,
Shape,
final,
)
Expand Down Expand Up @@ -121,6 +123,24 @@
_dtype_obj = np.dtype("object")


def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
use the original method.
"""

@wraps(meth)
def newfunc(self, *args, **kwargs) -> List[Block]:

if self.ndim == 1 or self.shape[0] == 1:
return meth(self, *args, **kwargs)
else:
# Split and operate column-by-column
return self.split_and_operate(meth, *args, **kwargs)

return cast(F, newfunc)


class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
Expand Down Expand Up @@ -441,17 +461,16 @@ def fillna(
# we can't process the value, but nothing to do
return [self] if inplace else [self.copy()]

# operate column-by-column
def f(mask, val, idx):
block = self.coerce_to_target_dtype(value)

# slice out our block
if idx is not None:
# i.e. self.ndim == 2
block = block.getitem_block(slice(idx, idx + 1))
return block.fillna(value, limit=limit, inplace=inplace, downcast=None)
elif self.ndim == 1 or self.shape[0] == 1:
blk = self.coerce_to_target_dtype(value)
# bc we have already cast, inplace=True may avoid an extra copy
return blk.fillna(value, limit=limit, inplace=True, downcast=None)

return self.split_and_operate(None, f, inplace)
else:
# operate column-by-column
return self.split_and_operate(
type(self).fillna, value, limit=limit, inplace=inplace, downcast=None
)

@final
def _split(self) -> List[Block]:
Expand All @@ -469,75 +488,27 @@ def _split(self) -> List[Block]:
return new_blocks

@final
def split_and_operate(
self, mask, f, inplace: bool, ignore_failures: bool = False
) -> List[Block]:
def split_and_operate(self, func, *args, **kwargs) -> List[Block]:
"""
split the block per-column, and apply the callable f
per-column, return a new block for each. Handle
masking which will not change a block unless needed.
Split the block and apply func column-by-column.

Parameters
----------
mask : 2-d boolean mask
f : callable accepting (1d-mask, 1d values, indexer)
inplace : bool
ignore_failures : bool, default False
func : Block method
*args
**kwargs

Returns
-------
list of blocks
List[Block]
"""
if mask is None:
mask = np.broadcast_to(True, shape=self.shape)

new_values = self.values

def make_a_block(nv, ref_loc):
if isinstance(nv, list):
assert len(nv) == 1, nv
assert isinstance(nv[0], Block)
block = nv[0]
else:
# Put back the dimension that was taken from it and make
# a block out of the result.
nv = ensure_block_shape(nv, ndim=self.ndim)
block = self.make_block(values=nv, placement=ref_loc)
return block

# ndim == 1
if self.ndim == 1:
if mask.any():
nv = f(mask, new_values, None)
else:
nv = new_values if inplace else new_values.copy()
block = make_a_block(nv, self._mgr_locs)
return [block]

# ndim > 1
new_blocks = []
for i, ref_loc in enumerate(self._mgr_locs):
m = mask[i]
v = new_values[i]

# need a new block
if m.any() or m.size == 0:
# Apply our function; we may ignore_failures if this is a
# reduction that is dropping nuisance columns GH#37827
try:
nv = f(m, v, i)
except TypeError:
if ignore_failures:
continue
else:
raise
else:
nv = v if inplace else v.copy()

block = make_a_block(nv, [ref_loc])
new_blocks.append(block)
assert self.ndim == 2 and self.shape[0] != 1

return new_blocks
res_blocks = []
for nb in self._split():
rbs = func(nb, *args, **kwargs)
res_blocks.extend(rbs)
return res_blocks

def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]:

Expand Down Expand Up @@ -577,13 +548,17 @@ def downcast(self, dtypes=None) -> List[Block]:
elif dtypes != "infer":
raise AssertionError("dtypes as dict is not supported yet")

# operate column-by-column
# this is expensive as it splits the blocks items-by-item
def f(mask, val, idx):
val = maybe_downcast_to_dtype(val, dtype="infer")
return val
return self._downcast_2d()

return self.split_and_operate(None, f, False)
@maybe_split
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find using a decorator & having _split very confusing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hah i find split_and_operate (the non-decorator alternative) confusing. maybe it can be simplified now that putmask no longer uses it (so one or more of its args can be removed)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so i can re-write split_and_operate to be basically just the if self.ndim == 2 and self.shape[0] != 1 part of maybe_split, but then every method that is decorated in this PR looks like:

def foo(self, *args, **kwargs):
    if self.ndim == 2 and self.shape[0] != 1:
       return self._split_and_operate(Block.foo, *args, **kwargs)
    [...]

and avoiding that repetition seems like exactly the use case for a decorator

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tried this out and it gets more verbose bc we have to write out *args/**kwargs each time

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mixing these approaches just makes this more confusing. pick.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, will update shortly.

def _downcast_2d(self) -> List[Block]:
"""
downcast specialized to 2D case post-validation.

Refactored to allow use of maybe_split.
"""
new_values = maybe_downcast_to_dtype(self.values, dtype="infer")
return [self.make_block(new_values)]

@final
def astype(self, dtype, copy: bool = False, errors: str = "raise"):
Expand Down Expand Up @@ -712,18 +687,13 @@ def replace(
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]

if not self._can_hold_element(value):
if self.ndim == 2 and self.shape[0] > 1:
# split so that we only upcast where necessary
nbs = self._split()
res_blocks = extend_blocks(
[
blk.replace(to_replace, value, inplace=inplace, regex=regex)
for blk in nbs
]
)
return res_blocks
elif self._can_hold_element(value):
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
blocks = blk.convert(numeric=False, copy=False)
return blocks

elif self.ndim == 1 or self.shape[0] == 1:
blk = self.coerce_to_target_dtype(value)
return blk.replace(
to_replace=to_replace,
Expand All @@ -732,10 +702,11 @@ def replace(
regex=regex,
)

blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
blocks = blk.convert(numeric=False, copy=False)
return blocks
else:
# split so that we only upcast where necessary
return self.split_and_operate(
type(self).replace, to_replace, value, inplace=inplace, regex=regex
)

@final
def _replace_regex(
Expand Down Expand Up @@ -2025,6 +1996,8 @@ class ObjectBlock(Block):
is_object = True
_can_hold_na = True

values: np.ndarray

@property
def is_bool(self):
"""
Expand All @@ -2033,26 +2006,15 @@ def is_bool(self):
"""
return lib.is_bool_array(self.values.ravel("K"))

@maybe_split
def reduce(self, func, ignore_failures: bool = False) -> List[Block]:
"""
For object-dtype, we operate column-wise.
"""
assert self.ndim == 2

values = self.values
if len(values) > 1:
# split_and_operate expects func with signature (mask, values, inplace)
def mask_func(mask, values, inplace):
if values.ndim == 1:
values = values.reshape(1, -1)
return func(values)

return self.split_and_operate(
None, mask_func, False, ignore_failures=ignore_failures
)

try:
res = func(values)
res = func(self.values)
except TypeError:
if not ignore_failures:
raise
Expand All @@ -2063,6 +2025,7 @@ def mask_func(mask, values, inplace):
res = res.reshape(1, -1)
return [self.make_block_same_class(res)]

@maybe_split
def convert(
self,
copy: bool = True,
Expand All @@ -2074,30 +2037,15 @@ def convert(
attempt to cast any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
"""

# operate column-by-column
def f(mask, val, idx):
shape = val.shape
values = soft_convert_objects(
val.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
copy=copy,
)
if isinstance(values, np.ndarray):
# TODO(EA2D): allow EA once reshape is supported
values = values.reshape(shape)

return values

if self.ndim == 2:
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
blocks = [self.make_block(values)]

return blocks
res_values = soft_convert_objects(
self.values.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
copy=copy,
)
res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]

def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]:

Expand Down