Skip to content

REF: simplify operating-columnwise dispatch #40256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 16, 2021
Merged
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 58 additions & 53 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from functools import wraps
import re
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -29,6 +30,7 @@
ArrayLike,
Dtype,
DtypeObj,
F,
Shape,
final,
)
Expand Down Expand Up @@ -118,6 +120,28 @@
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray


def maybe_split(meth: F) -> F:
    """
    Decorator: apply ``meth`` column-by-column on multi-column blocks.

    A 1-D block or a single-column 2-D block calls the wrapped method
    directly; otherwise the block is split into one-column pieces via
    ``_split`` and the method is applied to each piece, with the
    resulting blocks concatenated.
    """

    @wraps(meth)
    def newfunc(self, *args, **kwargs) -> List[Block]:
        if self.ndim == 1 or self.shape[0] == 1:
            # Already effectively single-column: no split required.
            return meth(self, *args, **kwargs)

        # Multi-column: dispatch to each single-column sub-block.
        result: List[Block] = []
        for sub_block in self._split():
            result.extend(meth(sub_block, *args, **kwargs))
        return result

    return cast(F, newfunc)


class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
Expand Down Expand Up @@ -489,17 +513,19 @@ def fillna(
# we can't process the value, but nothing to do
return [self] if inplace else [self.copy()]

# operate column-by-column
def f(mask, val, idx):
block = self.coerce_to_target_dtype(value)

# slice out our block
if idx is not None:
# i.e. self.ndim == 2
block = block.getitem_block(slice(idx, idx + 1))
return block.fillna(value, limit=limit, inplace=inplace, downcast=None)
elif self.ndim == 1 or self.shape[0] == 1:
blk = self.coerce_to_target_dtype(value)
# bc we have already cast, inplace=True may avoid an extra copy
return blk.fillna(value, limit=limit, inplace=True, downcast=None)

return self.split_and_operate(None, f, inplace)
else:
# operate column-by-column
res_blocks = []
nbs = self._split()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you can remove _split entirely and just use the decorator, I think that might be better.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The trouble is that we also use it in BlockManager.get_bool_data.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ohh, I think I may now understand the confusion.

Is the confusing thing "why are we iterating over self._split() here instead of using the decorator — i.e. why not use the decorator everywhere?"

If so, the reason is that in this method the conditions under which we want to split are slightly different from the conditions checked in the decorator. I'm hopeful we can get some further simplification, but I don't want to get it by jamming too much complexity into maybe_split (which is what made split_and_operate confusing in the first place).

for nb in nbs:
rbs = nb.fillna(value, limit=limit, inplace=inplace, downcast=None)
res_blocks.extend(rbs)
return res_blocks

@final
def _split(self) -> List[Block]:
Expand Down Expand Up @@ -625,13 +651,17 @@ def downcast(self, dtypes=None) -> List[Block]:
elif dtypes != "infer":
raise AssertionError("dtypes as dict is not supported yet")

# operate column-by-column
# this is expensive as it splits the blocks items-by-item
def f(mask, val, idx):
val = maybe_downcast_to_dtype(val, dtype="infer")
return val
return self._downcast_2d()

return self.split_and_operate(None, f, False)
@maybe_split
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find using a decorator & having _split very confusing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hah i find split_and_operate (the non-decorator alternative) confusing. maybe it can be simplified now that putmask no longer uses it (so one or more of its args can be removed)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so i can re-write split_and_operate to be basically just the if self.ndim == 2 and self.shape[0] != 1 part of maybe_split, but then every method that is decorated in this PR looks like:

def foo(self, *args, **kwargs):
    if self.ndim == 2 and self.shape[0] != 1:
       return self._split_and_operate(Block.foo, *args, **kwargs)
    [...]

and avoiding that repetition seems like exactly the use case for a decorator

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried this out and it gets more verbose, because we have to write out *args/**kwargs each time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mixing these approaches just makes this more confusing. pick.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, will update shortly.

def _downcast_2d(self) -> List[Block]:
    """
    Downcast specialized to the 2D case, post-validation.

    Kept as a separate single-column method so that ``maybe_split``
    can dispatch it column-by-column.
    """
    downcast_values = maybe_downcast_to_dtype(self.values, dtype="infer")
    return [self.make_block(downcast_values)]

@final
def astype(self, dtype, copy: bool = False, errors: str = "raise"):
Expand Down Expand Up @@ -2119,26 +2149,15 @@ def is_bool(self):
"""
return lib.is_bool_array(self.values.ravel("K"))

@maybe_split
def reduce(self, func, ignore_failures: bool = False) -> List[Block]:
"""
For object-dtype, we operate column-wise.
"""
assert self.ndim == 2

values = self.values
if len(values) > 1:
# split_and_operate expects func with signature (mask, values, inplace)
def mask_func(mask, values, inplace):
if values.ndim == 1:
values = values.reshape(1, -1)
return func(values)

return self.split_and_operate(
None, mask_func, False, ignore_failures=ignore_failures
)

try:
res = func(values)
res = func(self.values)
except TypeError:
if not ignore_failures:
raise
Expand All @@ -2149,6 +2168,7 @@ def mask_func(mask, values, inplace):
res = res.reshape(1, -1)
return [self.make_block_same_class(res)]

@maybe_split
def convert(
self,
copy: bool = True,
Expand All @@ -2160,30 +2180,15 @@ def convert(
attempt to cast any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
"""

# operate column-by-column
def f(mask, val, idx):
shape = val.shape
values = soft_convert_objects(
val.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
copy=copy,
)
if isinstance(values, np.ndarray):
# TODO(EA2D): allow EA once reshape is supported
values = values.reshape(shape)

return values

if self.ndim == 2:
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
blocks = [self.make_block(values)]

return blocks
res_values = soft_convert_objects(
self.values.ravel(),
datetime=datetime,
numeric=numeric,
timedelta=timedelta,
copy=copy,
)
res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]

def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]:

Expand Down