-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[CLN] Dispatch (some) Frame ops to Series, avoiding _data.eval #22019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
5195dad
1a66906
a45abec
8594c48
108550e
07fb477
946e54a
323f45e
b3cedf1
a0708d1
5d3db89
aa41de3
01c3720
bcb6735
b3ef417
4b2e21a
330a94a
4c4f626
757e2ae
ff96c0d
17f33b6
c30f898
82a7928
52b7102
453ae8e
93887cf
dd115c8
265ec78
db5ca89
f574c24
e6821a2
da2e32c
a6a7f58
b21a8a2
858165f
31d4089
5832c2b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,7 +70,8 @@ | |
check_bool_indexer) | ||
from pandas.core.internals import (BlockManager, | ||
create_block_manager_from_arrays, | ||
create_block_manager_from_blocks) | ||
create_block_manager_from_blocks, | ||
try_cast_result) | ||
from pandas.core.series import Series | ||
from pandas.core.arrays import Categorical, ExtensionArray | ||
import pandas.core.algorithms as algorithms | ||
|
@@ -4921,21 +4922,64 @@ def _arith_op(left, right): | |
copy=False) | ||
|
||
def _combine_match_index(self, other, func, level=None): | ||
assert isinstance(other, Series) | ||
left, right = self.align(other, join='outer', axis=0, level=level, | ||
copy=False) | ||
new_data = func(left.values.T, right.values).T | ||
return self._constructor(new_data, | ||
index=left.index, columns=self.columns, | ||
copy=False) | ||
assert left.index.equals(right.index) | ||
|
||
if left._is_mixed_type or right._is_mixed_type: | ||
if self.columns.is_unique: | ||
new_data = {col: func(left[col], right) | ||
for col in left.columns} | ||
result = self._constructor(new_data, | ||
index=left.index, | ||
columns=left.columns, | ||
copy=False) | ||
return result | ||
else: | ||
new_data = [func(left.iloc[:, idx], right) | ||
for idx in range(len(left.columns))] | ||
result = self._constructor(new_data, | ||
index=left.index, | ||
copy=False) | ||
result.columns = left.columns | ||
return result | ||
|
||
else: | ||
# easy case, operate directly on values | ||
result = func(left.values.T, right.values).T | ||
return self._constructor(result, | ||
index=left.index, columns=self.columns, | ||
copy=False) | ||
|
||
def _combine_match_columns(self, other, func, level=None, try_cast=True): | ||
# TODO: `func` passed here is wrapped in core.ops; if we are | ||
# dispatching to Series implementation, should we pass unwrapped? | ||
assert isinstance(other, Series) | ||
left, right = self.align(other, join='outer', axis=1, level=level, | ||
copy=False) | ||
assert left.columns.equals(right.index), (left.columns, right.index) | ||
|
||
new_data = left._data.eval(func=func, other=right, | ||
axes=[left.columns, self.index], | ||
try_cast=try_cast) | ||
return self._constructor(new_data) | ||
new_data = [func(left.iloc[:, n], right.iloc[n]) | ||
for n in range(len(left.columns))] | ||
|
||
if try_cast: | ||
new_data = [try_cast_result(left.iloc[:, n], new_data[n]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls don't repeat code for the unique / non-unique case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be trimmed down a bit, but is based on the existing |
||
for n in range(len(left.columns))] | ||
|
||
if left.columns.is_unique: | ||
new_data = {left.columns[n]: new_data[n] | ||
for n in range(len(left.columns))} | ||
result = self._constructor(new_data, | ||
index=left.index, columns=left.columns, | ||
copy=False) | ||
return result | ||
|
||
else: | ||
new_data = {i: new_data[i] for i in range(len(new_data))} | ||
result = self._constructor(new_data, index=left.index, copy=False) | ||
result.columns = left.columns | ||
return result | ||
|
||
def _combine_const(self, other, func, errors='raise', try_cast=True): | ||
new_data = self._data.eval(func=func, other=other, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,55 @@ | |
from pandas.compat import range, map, zip, u | ||
|
||
|
||
def try_cast_result(left, result, dtype=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IF you need this function, it should be in panda/core/dtypes/cast.py. I say IF because this looks like lots of other functions there. please see if you can simplify. we want to remove things from internals not add them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Sure, though I'm hoping we can get rid of it before long.
This is based on the existing
Very much so. The point of this PR (along with another branch in progress) is to remove |
||
""" | ||
Try to cast the result to the original dtype for `left`; we may have | ||
roundtripped thru object in the mean-time. | ||
|
||
Parameters | ||
---------- | ||
left : array-like | ||
result : array-like | ||
dtype : np.dtype, pd.dtype, or None (default None) | ||
|
||
Returns | ||
------- | ||
maybe_casted : same type as `result` | ||
""" | ||
if dtype is None: | ||
dtype = left.dtype | ||
|
||
if (is_integer_dtype(left) or is_bool_dtype(left) or | ||
is_datetime64_dtype(left) or is_datetime64tz_dtype(left)): | ||
pass | ||
elif is_object_dtype(left) and lib.infer_dtype(left) == 'boolean': | ||
# disguised is_bool_dtype | ||
pass | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is nearly cut/paste from the existing |
||
elif is_float_dtype(left) and result.dtype == left.dtype: | ||
|
||
# protect against a bool/object showing up here | ||
if isinstance(dtype, compat.string_types) and dtype == 'infer': | ||
return result | ||
if not isinstance(dtype, type): | ||
dtype = dtype.type | ||
if issubclass(dtype, (np.bool_, np.object_)): | ||
if issubclass(dtype, np.bool_): | ||
if isna(result).all(): | ||
return result.astype(np.bool_) | ||
else: | ||
result = result.astype(np.object_) | ||
result[result == 1] = True | ||
result[result == 0] = False | ||
return result | ||
else: | ||
return result.astype(np.object_) | ||
|
||
return result | ||
|
||
# may need to change the dtype here | ||
return maybe_downcast_to_dtype(result, dtype) | ||
|
||
|
||
class Block(PandasObject): | ||
""" | ||
Canonical n-dimensional unit of homogeneous dtype contained in a pandas | ||
|
@@ -711,34 +760,7 @@ def _try_cast_result(self, result, dtype=None): | |
""" try to cast the result to our original type, we may have | ||
roundtripped thru object in the mean-time | ||
""" | ||
if dtype is None: | ||
dtype = self.dtype | ||
|
||
if self.is_integer or self.is_bool or self.is_datetime: | ||
pass | ||
elif self.is_float and result.dtype == self.dtype: | ||
|
||
# protect against a bool/object showing up here | ||
if isinstance(dtype, compat.string_types) and dtype == 'infer': | ||
return result | ||
if not isinstance(dtype, type): | ||
dtype = dtype.type | ||
if issubclass(dtype, (np.bool_, np.object_)): | ||
if issubclass(dtype, np.bool_): | ||
if isna(result).all(): | ||
return result.astype(np.bool_) | ||
else: | ||
result = result.astype(np.object_) | ||
result[result == 1] = True | ||
result[result == 0] = False | ||
return result | ||
else: | ||
return result.astype(np.object_) | ||
|
||
return result | ||
|
||
# may need to change the dtype here | ||
return maybe_downcast_to_dtype(result, dtype) | ||
return try_cast_result(self, result, dtype) | ||
|
||
def _try_coerce_args(self, values, other): | ||
""" provide coercion to our input arguments """ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -738,7 +738,7 @@ def test_align_int_fill_bug(self): | |
|
||
result = df1 - df1.mean() | ||
expected = df2 - df2.mean() | ||
assert_frame_equal(result, expected) | ||
assert_frame_equal(result.astype('f8'), expected) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why did this change? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as below There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this PR needs a note in the what's new describing this change (as its effectievly an API change) |
||
|
||
def test_align_multiindex(self): | ||
# GH 10665 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1565,8 +1565,9 @@ def test_crosstab_normalize(self): | |
full_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), | ||
row_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), | ||
col_normal) | ||
tm.assert_frame_equal( | ||
pd.crosstab(df.a, df.b, normalize='columns').astype('f8'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this changing is not great, the point of these ops is that the dtypes won't change on the pivot There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The change is happening because ATM the casting is done at the Block-level, while with the PR casting is done at the column-level.
and under the PR gives:
i.e. the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the correct way to answer this is ask "was the upcast from int to float the result of coercing to a single dtype for multiple columns?" If so, then we should preserve integer dtype where we can, i.e. For an all-integer In [27]: a = pd.Series([1, 2])
In [28]: pd.crosstab(a, a)
Out[28]:
col_0 1 2
row_0
1 1 0
2 0 1 so this change seems good (and maybe worth explicitly testing rather than astyping the output, to ensure that we don't regress)? |
||
col_normal) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), | ||
pd.crosstab(df.a, df.b, normalize='columns')) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), | ||
|
@@ -1599,7 +1600,8 @@ def test_crosstab_normalize(self): | |
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', | ||
margins=True), row_normal_margins) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', | ||
margins=True), col_normal_margins) | ||
margins=True).astype('f8'), | ||
col_normal_margins) | ||
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, | ||
margins=True), all_normal_margins) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there cases where this might no be true? If not, then let's remove it, as it could be somewhat expensive (though maybe not, since the output of align shares indices.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This always holds. I added it largely so I wouldn't have to keep going back to core.ops to double-check what was being passed.