Skip to content

CLN: small ops optimizations #28036

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5294,12 +5294,19 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
this, other = self.align(other, join="outer", level=level, copy=False)
new_index, new_columns = this.index, this.columns

def _arith_op(left, right):
# for the mixed_type case where we iterate over columns,
# _arith_op(left, right) is equivalent to
# left._binop(right, func, fill_value=fill_value)
left, right = ops.fill_binop(left, right, fill_value)
return func(left, right)
if fill_value is None:
# since _arith_op may be called in a loop, avoid function call
# overhead if possible by doing this check once
_arith_op = func

else:

def _arith_op(left, right):
# for the mixed_type case where we iterate over columns,
# _arith_op(left, right) is equivalent to
# left._binop(right, func, fill_value=fill_value)
left, right = ops.fill_binop(left, right, fill_value)
return func(left, right)

if ops.should_series_dispatch(this, other, func):
# iterate over columns
Expand All @@ -5312,7 +5319,7 @@ def _arith_op(left, right):

def _combine_match_index(self, other, func, level=None):
left, right = self.align(other, join="outer", axis=0, level=level, copy=False)
assert left.index.equals(right.index)
# at this point we have `left.index.equals(right.index)`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we want to remove the asserts here? If someone is that set on getting this optimization couldn't they just use the -OO flag?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i put these assertions in when trying to understand the ops code a while back. having a comment to the same effect performs the same task for the reader.


if left._is_mixed_type or right._is_mixed_type:
# operate column-wise; avoid costly object-casting in `.values`
Expand All @@ -5325,14 +5332,13 @@ def _combine_match_index(self, other, func, level=None):
new_data, index=left.index, columns=self.columns, copy=False
)

def _combine_match_columns(self, other, func, level=None):
assert isinstance(other, Series)
def _combine_match_columns(self, other: Series, func, level=None):
left, right = self.align(other, join="outer", axis=1, level=level, copy=False)
assert left.columns.equals(right.index)
# at this point we have `left.columns.equals(right.index)`
return ops.dispatch_to_series(left, right, func, axis="columns")

def _combine_const(self, other, func):
assert lib.is_scalar(other) or np.ndim(other) == 0
# scalar other or np.ndim(other) == 0
return ops.dispatch_to_series(self, other, func)

def combine(self, other, func, fill_value=None, overwrite=True):
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]):
# np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D')
return Timedelta(obj)

elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj):
elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype):
# GH#22390 Unfortunately we need to special-case right-hand
# timedelta64 dtypes because numpy casts integer dtypes to
# timedelta64 when operating with timedelta64
Expand Down Expand Up @@ -755,7 +755,7 @@ def na_op(x, y):
assert not isinstance(y, (list, ABCSeries, ABCIndexClass))
if isinstance(y, np.ndarray):
# bool-bool dtype operations should be OK, should not get here
assert not (is_bool_dtype(x) and is_bool_dtype(y))
assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype))
x = ensure_object(x)
y = ensure_object(y)
result = libops.vec_binop(x, y, op)
Expand Down Expand Up @@ -804,7 +804,7 @@ def wrapper(self, other):

else:
# scalars, list, tuple, np.array
is_other_int_dtype = is_integer_dtype(np.asarray(other))
is_other_int_dtype = is_integer_dtype(np.asarray(other).dtype)
if is_list_like(other) and not isinstance(other, np.ndarray):
# TODO: Can we do this before the is_integer_dtype check?
# could the is_integer_dtype check be checking the wrong
Expand Down Expand Up @@ -988,10 +988,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
self, other, pass_op, fill_value=fill_value, axis=axis, level=level
)
else:
# in this case we always have `np.ndim(other) == 0`
if fill_value is not None:
self = self.fillna(fill_value)

assert np.ndim(other) == 0
return self._combine_const(other, op)

f.__name__ = op_name
Expand Down Expand Up @@ -1032,7 +1032,7 @@ def f(self, other, axis=default_axis, level=None):
self, other, na_op, fill_value=None, axis=axis, level=level
)
else:
assert np.ndim(other) == 0, other
# in this case we always have `np.ndim(other) == 0`
return self._combine_const(other, na_op)

f.__name__ = op_name
Expand Down
12 changes: 6 additions & 6 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
find_common_type,
maybe_upcast_putmask,
)
from pandas.core.dtypes.common import is_object_dtype, is_period_dtype, is_scalar
from pandas.core.dtypes.common import is_object_dtype, is_scalar
from pandas.core.dtypes.generic import ABCIndex, ABCSeries
from pandas.core.dtypes.missing import notna

Expand Down Expand Up @@ -57,9 +57,9 @@ def masked_arith_op(x, y, op):
dtype = find_common_type([x.dtype, y.dtype])
result = np.empty(x.size, dtype=dtype)

# PeriodIndex.ravel() returns int64 dtype, so we have
# to work around that case. See GH#19956
yrav = y if is_period_dtype(y) else y.ravel()
# NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
# we would get int64 dtype, see GH#19956
yrav = y.ravel()
mask = notna(xrav) & notna(yrav)

if yrav.shape != mask.shape:
Expand All @@ -82,9 +82,9 @@ def masked_arith_op(x, y, op):
mask = notna(xrav)

# 1 ** np.nan is 1. So we have to unmask those.
if op == pow:
if op is pow:
mask = np.where(x == 1, False, mask)
elif op == rpow:
elif op is rpow:
mask = np.where(y == 1, False, mask)

if mask.any():
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/ops/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def fill_zeros(result, x, y, name, fill):

Mask the nan's from x.
"""
if fill is None or is_float_dtype(result):
if fill is None or is_float_dtype(result.dtype):
return result

if name.startswith(("r", "__r")):
Expand All @@ -55,7 +55,7 @@ def fill_zeros(result, x, y, name, fill):
if is_scalar_type:
y = np.array(y)

if is_integer_dtype(y):
if is_integer_dtype(y.dtype):

if (y == 0).any():

Expand Down
5 changes: 1 addition & 4 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,9 +540,6 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
this, other = self.align(other, join="outer", level=level, copy=False)
new_index, new_columns = this.index, this.columns

if self.empty and other.empty:
return self._constructor(index=new_index).__finalize__(self)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is covered by a test?

(in general, I would rather leave sparse alone, this is deprecated anyway)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it is.

I understand the impulse to leave sparse alone, but ATM some of it is a barrier to simplifying ops code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I was just saying on the other PR as well, there is also the option to just leave sparse as is, but still go forward with simplifying the ops code for non-sparse

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can also make it easier for you to focus only on the ops code for normal frame (without additional complexity of sharing with sparse)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand. core.ops functions call the DataFrame/SparseDataFrame methods, so anything I want to do to change the behavior of the non-sparse methods needs to be reflected in the sparse methods


new_data = {}
if fill_value is not None:
# TODO: be a bit more intelligent here
Expand All @@ -569,13 +566,13 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
).__finalize__(self)

def _combine_match_index(self, other, func, level=None):
new_data = {}

if level is not None:
raise NotImplementedError("'level' argument is not supported")

this, other = self.align(other, join="outer", axis=0, level=level, copy=False)

new_data = {}
for col, series in this.items():
new_data[col] = func(series.values, other.values)

Expand Down