-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
REF: simplify operating-columnwise dispatch #40256
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
bd97cfd
b38bbed
3d7951c
78b15d3
d2bb792
1caeeda
493dd53
996012d
46e2b42
cdf7de3
a0bf89a
52ae4e3
a71e974
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
from __future__ import annotations | ||
|
||
from functools import wraps | ||
import re | ||
from typing import ( | ||
TYPE_CHECKING, | ||
|
@@ -29,6 +30,7 @@ | |
ArrayLike, | ||
Dtype, | ||
DtypeObj, | ||
F, | ||
Shape, | ||
final, | ||
) | ||
|
@@ -118,6 +120,28 @@ | |
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray | ||
|
||
|
||
def maybe_split(meth: F) -> F: | ||
""" | ||
If we have a multi-column block, split and operate block-wise. Otherwise | ||
use the original method. | ||
""" | ||
|
||
@wraps(meth) | ||
def newfunc(self, *args, **kwargs) -> List[Block]: | ||
|
||
if self.ndim == 1 or self.shape[0] == 1: | ||
return meth(self, *args, **kwargs) | ||
else: | ||
# Split and operate column-by-column | ||
res_blocks = [] | ||
for nb in self._split(): | ||
rbs = meth(nb, *args, **kwargs) | ||
res_blocks.extend(rbs) | ||
return res_blocks | ||
|
||
return cast(F, newfunc) | ||
|
||
|
||
class Block(PandasObject): | ||
""" | ||
Canonical n-dimensional unit of homogeneous dtype contained in a pandas | ||
|
@@ -489,17 +513,19 @@ def fillna( | |
# we can't process the value, but nothing to do | ||
return [self] if inplace else [self.copy()] | ||
|
||
# operate column-by-column | ||
def f(mask, val, idx): | ||
block = self.coerce_to_target_dtype(value) | ||
|
||
# slice out our block | ||
if idx is not None: | ||
# i.e. self.ndim == 2 | ||
block = block.getitem_block(slice(idx, idx + 1)) | ||
return block.fillna(value, limit=limit, inplace=inplace, downcast=None) | ||
elif self.ndim == 1 or self.shape[0] == 1: | ||
blk = self.coerce_to_target_dtype(value) | ||
# bc we have already cast, inplace=True may avoid an extra copy | ||
return blk.fillna(value, limit=limit, inplace=True, downcast=None) | ||
|
||
return self.split_and_operate(None, f, inplace) | ||
else: | ||
# operate column-by-column | ||
res_blocks = [] | ||
nbs = self._split() | ||
for nb in nbs: | ||
rbs = nb.fillna(value, limit=limit, inplace=inplace, downcast=None) | ||
res_blocks.extend(rbs) | ||
return res_blocks | ||
|
||
@final | ||
def _split(self) -> List[Block]: | ||
|
@@ -625,13 +651,17 @@ def downcast(self, dtypes=None) -> List[Block]: | |
elif dtypes != "infer": | ||
raise AssertionError("dtypes as dict is not supported yet") | ||
|
||
# operate column-by-column | ||
# this is expensive as it splits the blocks items-by-item | ||
def f(mask, val, idx): | ||
val = maybe_downcast_to_dtype(val, dtype="infer") | ||
return val | ||
return self._downcast_2d() | ||
|
||
return self.split_and_operate(None, f, False) | ||
@maybe_split | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I find using a decorator & having _split very confusing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hah i find split_and_operate (the non-decorator alternative) confusing. maybe it can be simplified now that putmask no longer uses it (so one or more of its args can be removed) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so i can re-write split_and_operate to be basically just the
and avoiding that repetition seems like exactly the use case for a decorator There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tried this out and it gets more verbose bc we have to write out *args/**kwargs each time There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. mixing these approaches just makes this more confusing. pick. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, will update shortly. |
||
def _downcast_2d(self) -> List[Block]: | ||
""" | ||
downcast specialized to 2D case post-validation. | ||
|
||
Refactored to allow use of maybe_split. | ||
""" | ||
new_values = maybe_downcast_to_dtype(self.values, dtype="infer") | ||
return [self.make_block(new_values)] | ||
|
||
@final | ||
def astype(self, dtype, copy: bool = False, errors: str = "raise"): | ||
|
@@ -2119,26 +2149,15 @@ def is_bool(self): | |
""" | ||
return lib.is_bool_array(self.values.ravel("K")) | ||
|
||
@maybe_split | ||
def reduce(self, func, ignore_failures: bool = False) -> List[Block]: | ||
""" | ||
For object-dtype, we operate column-wise. | ||
""" | ||
assert self.ndim == 2 | ||
|
||
values = self.values | ||
if len(values) > 1: | ||
# split_and_operate expects func with signature (mask, values, inplace) | ||
def mask_func(mask, values, inplace): | ||
if values.ndim == 1: | ||
values = values.reshape(1, -1) | ||
return func(values) | ||
|
||
return self.split_and_operate( | ||
None, mask_func, False, ignore_failures=ignore_failures | ||
) | ||
|
||
try: | ||
res = func(values) | ||
res = func(self.values) | ||
except TypeError: | ||
if not ignore_failures: | ||
raise | ||
|
@@ -2149,6 +2168,7 @@ def mask_func(mask, values, inplace): | |
res = res.reshape(1, -1) | ||
return [self.make_block_same_class(res)] | ||
|
||
@maybe_split | ||
def convert( | ||
self, | ||
copy: bool = True, | ||
|
@@ -2160,30 +2180,15 @@ def convert( | |
attempt to cast any object types to better types return a copy of | ||
the block (if copy = True) by definition we ARE an ObjectBlock!!!!! | ||
""" | ||
|
||
# operate column-by-column | ||
def f(mask, val, idx): | ||
shape = val.shape | ||
values = soft_convert_objects( | ||
val.ravel(), | ||
datetime=datetime, | ||
numeric=numeric, | ||
timedelta=timedelta, | ||
copy=copy, | ||
) | ||
if isinstance(values, np.ndarray): | ||
# TODO(EA2D): allow EA once reshape is supported | ||
values = values.reshape(shape) | ||
|
||
return values | ||
|
||
if self.ndim == 2: | ||
blocks = self.split_and_operate(None, f, False) | ||
else: | ||
values = f(None, self.values.ravel(), None) | ||
blocks = [self.make_block(values)] | ||
|
||
return blocks | ||
res_values = soft_convert_objects( | ||
self.values.ravel(), | ||
datetime=datetime, | ||
numeric=numeric, | ||
timedelta=timedelta, | ||
copy=copy, | ||
) | ||
res_values = ensure_block_shape(res_values, self.ndim) | ||
return [self.make_block(res_values)] | ||
|
||
def _maybe_downcast(self, blocks: List[Block], downcast=None) -> List[Block]: | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you can remove _split entirely and just use the decorator, I think that might be better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
trouble is we also use it in BlockManager.get_bool_data
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ohh, i think i may now understand the confusion.
is the confusing thing "why are we iterating over self._split() here instead of using the decorator? i.e. why not use the decorator everywhere?"?
if so, the reason is bc in this method, the conditions under which we want to split are slightly different from the conditions checked in the decorator. im hopeful we can get some further simplification, but dont want to get it by jamming too much complexity into maybe_split (which is what got split_and_operate confusing in the first place)