Skip to content

REF: de-duplicate DataFrame/SparseDataFrame arithmetic code #23414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4935,6 +4935,33 @@ def reorder_levels(self, order, axis=0):
# ----------------------------------------------------------------------
# Arithmetic / combination related

def _wrap_dispatched_op(self, result, other, func, axis=None):
"""
Wrap the result of an arithmetic/comparison operation performed
via ops.dispatch_to_series in a properly-indexed DataFrame.

Parameters
----------
result : dict[int:Series]
other : DataFrame, Series, or scalar
func : binary operator
axis : {0, 1, "index", "columns"}

Returns
-------
DataFrame

Notes
-----
The `other`, `func`, and `axis` arguments are not used here but are
included to keep the signature consistent with that in SparseDataFrame.
"""
result = self._constructor(result, index=self.index, copy=False)
# Pin columns instead of passing to constructor for compat with
# non-unique columns case
result.columns = self.columns
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this need to call __finalize__

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This maintains the current behavior.

At some point we can/should make a concerted effort to be internally-consistent about calling __finalize__, but that is a large, orthogonal undertaking. Calling it here without doing it elsewhere would make things more inconsistent.

return result

def _combine_frame(self, other, func, fill_value=None, level=None):
this, other = self.align(other, join='outer', level=level, copy=False)
new_index, new_columns = this.index, this.columns
Expand Down
20 changes: 7 additions & 13 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def fill_binop(left, right, fill_value):
return left, right


def mask_cmp_op(x, y, op, allowed_types):
def mask_cmp_op(x, y, op):
"""
Apply the function `op` to only non-null points in x and y.

Expand All @@ -801,16 +801,15 @@ def mask_cmp_op(x, y, op, allowed_types):
x : array-like
y : array-like
op : binary operation
allowed_types : class or tuple of classes

Returns
-------
result : ndarray[bool]
"""
# TODO: Can we make the allowed_types arg unnecessary?
# TODO: can we do this without casting to list?
xrav = x.ravel()
result = np.empty(x.size, dtype=bool)
if isinstance(y, allowed_types):
if isinstance(y, (np.ndarray, ABCSeries)):
yrav = y.ravel()
mask = notna(xrav) & notna(yrav)
result[mask] = op(np.array(list(xrav[mask])),
Expand Down Expand Up @@ -1006,12 +1005,7 @@ def column_op(a, b):
raise NotImplementedError(right)

new_data = expressions.evaluate(column_op, str_rep, left, right)

result = left._constructor(new_data, index=left.index, copy=False)
# Pin columns instead of passing to constructor for compat with
# non-unique columns case
result.columns = left.columns
return result
return left._wrap_dispatched_op(new_data, right, func, axis=axis)


def dispatch_to_index_op(op, left, right, index_class):
Expand Down Expand Up @@ -1896,7 +1890,7 @@ def na_op(x, y):
with np.errstate(invalid='ignore'):
result = op(x, y)
except TypeError:
result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries))
result = mask_cmp_op(x, y, op)
return result

@Appender('Wrapper for flexible comparison methods {name}'
Expand Down Expand Up @@ -1948,7 +1942,7 @@ def f(self, other):
else:

# straight boolean comparisons we want to allow all columns
# (regardless of dtype to pass thru) See #4537 for discussion.
# (regardless of dtype to pass thru) See GH#4537 for discussion.
res = self._combine_const(other, func)
return res.fillna(True).astype(bool)

Expand Down Expand Up @@ -1986,7 +1980,7 @@ def na_op(x, y):
try:
result = expressions.evaluate(op, str_rep, x, y)
except TypeError:
result = mask_cmp_op(x, y, op, np.ndarray)
result = mask_cmp_op(x, y, op)
return result

@Appender('Wrapper for comparison method {name}'.format(name=op_name))
Expand Down
103 changes: 62 additions & 41 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,10 @@ def _apply_columns(self, func):
for col, series in compat.iteritems(self):
new_data[col] = func(series)

return self._constructor(
data=new_data, index=self.index, columns=self.columns,
default_fill_value=self.default_fill_value).__finalize__(self)
# pass dummy arguments for func and axis; `other` just needs to be
# a scalar.
return self._wrap_dispatched_op(new_data, other=None,
func=None, axis=None)

def astype(self, dtype):
return self._apply_columns(lambda x: x.astype(dtype))
Expand Down Expand Up @@ -547,6 +548,31 @@ def xs(self, key, axis=0, copy=False):
# ----------------------------------------------------------------------
# Arithmetic-related methods

def _wrap_dispatched_op(self, result, other, func, axis=None):
    """
    Wrap the per-column results of an arithmetic/comparison operation
    performed via ops.dispatch_to_series in a SparseDataFrame with this
    frame's index/columns and an appropriate fill value.

    Parameters
    ----------
    result : dict[int, Series]
        Mapping of column position to the computed column.
    other : DataFrame, Series, scalar
        The right-hand operand; used to pick the result fill value.
    func : binary function
        The operation that was applied; used to pick the fill value.
    axis : {None, "columns"}, optional
        Forwarded to the fill-value resolution.

    Returns
    -------
    SparseDataFrame
    """
    # Resolve the default_fill_value for the result from the operands
    # and the operation that produced it.
    new_fill = self._get_op_result_fill_value(other, func, axis)

    out = self._constructor(result,
                            index=self.index,
                            columns=self.columns,
                            default_fill_value=new_fill)
    # Propagate metadata from the calling frame.
    return out.__finalize__(self)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why would this call finalize? why would the DataFrame one not?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea, that's why I opened #23028.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, we basically need to always call __finalize__ after every op (binary or unary). It prob isn't done consistently. So I would do it for both here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do this, but am not wild about it. Current __finalize__/_metadata handling is quarter-assed, and I'd prefer to leave it that way until it is specifically addressed than make it half-assed. Still though, will update.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm this actually makes things less consistent within DataFrame (though slightly more consistent between DataFrame - SparseDataFrame) since lots of ops dont go through this path.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you just call the super function here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DataFrame constructor doesn't take a fill_value argument. We would end up special-casing within the DataFrame method in a way equivalent to (but messier than and less performant than) overriding in the appropriate subclass.


def _combine_frame(self, other, func, fill_value=None, level=None):
if level is not None:
raise NotImplementedError("'level' argument is not supported")
Expand All @@ -557,46 +583,33 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
if self.empty and other.empty:
return self._constructor(index=new_index).__finalize__(self)

new_data = {}
if fill_value is not None:
# TODO: be a bit more intelligent here
for col in new_columns:
if col in this and col in other:
dleft = this[col].to_dense()
dright = other[col].to_dense()
result = dleft._binop(dright, func, fill_value=fill_value)
result = result.to_sparse(fill_value=this[col].fill_value)
new_data[col] = result
else:
def _arith_op(left, right):
# analogous to _arith_op defined in DataFrame._combine_frame
dleft = left.to_dense()
dright = right.to_dense()
result = dleft._binop(dright, func, fill_value=fill_value)
result = result.to_sparse(fill_value=left.fill_value)
return result

for col in new_columns:
if col in this and col in other:
new_data[col] = func(this[col], other[col])
assert all(col in this and col in other for col in new_columns)

new_fill_value = self._get_op_result_fill_value(other, func)
new_data = {col: _arith_op(this[col], other[col])
for col in new_columns}

return self._constructor(data=new_data, index=new_index,
columns=new_columns,
default_fill_value=new_fill_value
).__finalize__(self)
return this._wrap_dispatched_op(new_data, other, func, axis=None)

def _combine_match_index(self, other, func, level=None):
new_data = {}

if level is not None:
raise NotImplementedError("'level' argument is not supported")

this, other = self.align(other, join='outer', axis=0, level=level,
copy=False)

for col, series in compat.iteritems(this):
new_data[col] = func(series.values, other.values)
new_data = {col: func(this[col].values, other.values)
for col in this.columns}

fill_value = self._get_op_result_fill_value(other, func)

return self._constructor(
new_data, index=this.index, columns=self.columns,
default_fill_value=fill_value).__finalize__(self)
return self._wrap_dispatched_op(new_data, other, func, axis=None)

def _combine_match_columns(self, other, func, level=None):
# patched version of DataFrame._combine_match_columns to account for
Expand All @@ -611,29 +624,33 @@ def _combine_match_columns(self, other, func, level=None):
copy=False)
assert left.columns.equals(right.index)

new_data = {}

for col in left.columns:
new_data[col] = func(left[col], float(right[col]))
new_data = {col: func(left[col], float(right[col]))
for col in left.columns}

return self._constructor(
new_data, index=left.index, columns=left.columns,
default_fill_value=self.default_fill_value).__finalize__(self)
return self._wrap_dispatched_op(new_data, other, func, "columns")

def _combine_const(self, other, func):
return self._apply_columns(lambda x: func(x, other))
new_data = {col: func(self[col], other) for col in self.columns}

return self._wrap_dispatched_op(new_data, other, func, axis=None)

def _get_op_result_fill_value(self, other, func):
def _get_op_result_fill_value(self, other, func, axis=None):
own_default = self.default_fill_value

if isinstance(other, DataFrame):
axis = self._get_axis_name(axis) if axis is not None else None

if axis == "columns":
# i.e. called via _combine_match_columns
fill_value = self.default_fill_value

elif isinstance(other, DataFrame):
# i.e. called from _combine_frame

other_default = getattr(other, 'default_fill_value', np.nan)

# if the fill values are the same use them? or use a valid one
if own_default == other_default:
# TOOD: won't this evaluate as False if both are np.nan?
# TODO: won't this evaluate as False if both are np.nan?
fill_value = own_default
elif np.isnan(own_default) and not np.isnan(other_default):
fill_value = other_default
Expand All @@ -652,6 +669,10 @@ def _get_op_result_fill_value(self, other, func):
fill_value = func(np.float64(own_default),
np.float64(other.fill_value))

elif np.ndim(other) == 0:
# i.e. called via _combine_const
fill_value = self.default_fill_value

else:
raise NotImplementedError(type(other))

Expand Down