Skip to content

Commit 15d32bb

Browse files
authored
[CLN] Dispatch (some) Frame ops to Series, avoiding _data.eval (pandas-dev#22019)
* avoid casting to object dtype in mixed-type frames * Dispatch to Series ops in _combine_match_columns * comment * docstring * flake8 fixup * dont bother with try_cast_result * revert non-central change * simplify * revert try_cast_results * revert non-central changes * Fixup typo syntaxerror * simplify assertion * use dispatch_to_series in combine_match_columns * Pass unwrapped op where appropriate * catch correct error * whatsnew note * comment * whatsnew section * remove unnecessary tester * doc fixup
1 parent 3e3256b commit 15d32bb

File tree

7 files changed

+70
-37
lines changed

7 files changed

+70
-37
lines changed

doc/source/whatsnew/v0.24.0.txt

+29
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,35 @@ Current Behavior:
532532
...
533533
OverflowError: Trying to coerce negative values to unsigned integers
534534

535+
.. _whatsnew_0240.api.crosstab_dtypes:
536+
537+
Crosstab Preserves Dtypes
538+
^^^^^^^^^^^^^^^^^^^^^^^^^
539+
540+
:func:`crosstab` will now preserve dtypes in some cases that previously would
541+
cast from integer dtype to floating dtype (:issue:`22019`)
542+
543+
Previous Behavior:
544+
545+
.. code-block:: ipython
546+
547+
In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
548+
...: 'c': [1, 1, np.nan, 1, 1]})
549+
In [4]: pd.crosstab(df.a, df.b, normalize='columns')
550+
Out[4]:
551+
b 3 4
552+
a
553+
1 0.5 0.0
554+
2 0.5 1.0
555+
556+
Current Behavior:
557+
558+
.. code-block:: ipython
559+
560+
In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
561+
...: 'c': [1, 1, np.nan, 1, 1]})
562+
In [4]: pd.crosstab(df.a, df.b, normalize='columns')
563+
535564
Datetimelike API Changes
536565
^^^^^^^^^^^^^^^^^^^^^^^^
537566

pandas/core/frame.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -4899,7 +4899,6 @@ def _arith_op(left, right):
48994899
copy=False)
49004900

49014901
def _combine_match_index(self, other, func, level=None):
4902-
assert isinstance(other, Series)
49034902
left, right = self.align(other, join='outer', axis=0, level=level,
49044903
copy=False)
49054904
assert left.index.equals(right.index)
@@ -4919,11 +4918,7 @@ def _combine_match_columns(self, other, func, level=None, try_cast=True):
49194918
left, right = self.align(other, join='outer', axis=1, level=level,
49204919
copy=False)
49214920
assert left.columns.equals(right.index)
4922-
4923-
new_data = left._data.eval(func=func, other=right,
4924-
axes=[left.columns, self.index],
4925-
try_cast=try_cast)
4926-
return self._constructor(new_data)
4921+
return ops.dispatch_to_series(left, right, func, axis="columns")
49274922

49284923
def _combine_const(self, other, func, errors='raise', try_cast=True):
49294924
if lib.is_scalar(other) or np.ndim(other) == 0:

pandas/core/ops.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1666,7 +1666,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
16661666
# -----------------------------------------------------------------------------
16671667
# DataFrame
16681668

1669-
def dispatch_to_series(left, right, func, str_rep=None):
1669+
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
16701670
"""
16711671
Evaluate the frame operation func(left, right) by evaluating
16721672
column-by-column, dispatching to the Series implementation.
@@ -1677,6 +1677,7 @@ def dispatch_to_series(left, right, func, str_rep=None):
16771677
right : scalar or DataFrame
16781678
func : arithmetic or comparison operator
16791679
str_rep : str or None, default None
1680+
axis : {None, 0, 1, "index", "columns"}
16801681
16811682
Returns
16821683
-------
@@ -1700,6 +1701,15 @@ def column_op(a, b):
17001701
return {i: func(a.iloc[:, i], b.iloc[:, i])
17011702
for i in range(len(a.columns))}
17021703

1704+
elif isinstance(right, ABCSeries) and axis == "columns":
1705+
# We only get here if called via left._combine_match_columns,
1706+
# in which case we specifically want to operate row-by-row
1707+
assert right.index.equals(left.columns)
1708+
1709+
def column_op(a, b):
1710+
return {i: func(a.iloc[:, i], b.iloc[i])
1711+
for i in range(len(a.columns))}
1712+
17031713
elif isinstance(right, ABCSeries):
17041714
assert right.index.equals(left.index) # Handle other cases later
17051715

@@ -1844,7 +1854,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
18441854
pass_op = op if should_series_dispatch(self, other, op) else na_op
18451855
return self._combine_frame(other, pass_op, fill_value, level)
18461856
elif isinstance(other, ABCSeries):
1847-
return _combine_series_frame(self, other, na_op,
1857+
# For these values of `axis`, we end up dispatching to Series op,
1858+
# so do not want the masked op.
1859+
pass_op = op if axis in [0, "columns", None] else na_op
1860+
return _combine_series_frame(self, other, pass_op,
18481861
fill_value=fill_value, axis=axis,
18491862
level=level, try_cast=True)
18501863
else:

pandas/tests/arithmetic/test_timedelta64.py

+14-20
Original file line numberDiff line numberDiff line change
@@ -505,33 +505,25 @@ def test_tdi_add_dt64_array(self, box_df_broadcast_failure):
505505
# ------------------------------------------------------------------
506506
# Operations with int-like others
507507

508-
def test_td64arr_add_int_series_invalid(self, box_df_broadcast_failure,
509-
tdser):
510-
box = box_df_broadcast_failure
508+
def test_td64arr_add_int_series_invalid(self, box, tdser):
511509
tdser = tm.box_expected(tdser, box)
512510
err = TypeError if box is not pd.Index else NullFrequencyError
513511
with pytest.raises(err):
514512
tdser + Series([2, 3, 4])
515513

516-
def test_td64arr_radd_int_series_invalid(self, box_df_broadcast_failure,
517-
tdser):
518-
box = box_df_broadcast_failure
514+
def test_td64arr_radd_int_series_invalid(self, box, tdser):
519515
tdser = tm.box_expected(tdser, box)
520516
err = TypeError if box is not pd.Index else NullFrequencyError
521517
with pytest.raises(err):
522518
Series([2, 3, 4]) + tdser
523519

524-
def test_td64arr_sub_int_series_invalid(self, box_df_broadcast_failure,
525-
tdser):
526-
box = box_df_broadcast_failure
520+
def test_td64arr_sub_int_series_invalid(self, box, tdser):
527521
tdser = tm.box_expected(tdser, box)
528522
err = TypeError if box is not pd.Index else NullFrequencyError
529523
with pytest.raises(err):
530524
tdser - Series([2, 3, 4])
531525

532-
def test_td64arr_rsub_int_series_invalid(self, box_df_broadcast_failure,
533-
tdser):
534-
box = box_df_broadcast_failure
526+
def test_td64arr_rsub_int_series_invalid(self, box, tdser):
535527
tdser = tm.box_expected(tdser, box)
536528
err = TypeError if box is not pd.Index else NullFrequencyError
537529
with pytest.raises(err):
@@ -605,9 +597,10 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box, scalar, tdser):
605597
Series([1, 2, 3])
606598
# TODO: Add DataFrame in here?
607599
], ids=lambda x: type(x).__name__)
608-
def test_td64arr_add_sub_numeric_arr_invalid(
609-
self, box_df_broadcast_failure, vec, dtype, tdser):
610-
box = box_df_broadcast_failure
600+
def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype, tdser):
601+
if box is pd.DataFrame and not isinstance(vec, Series):
602+
raise pytest.xfail(reason="Tries to broadcast incorrectly")
603+
611604
tdser = tm.box_expected(tdser, box)
612605
err = TypeError
613606
if box is pd.Index and not dtype.startswith('float'):
@@ -930,9 +923,9 @@ def test_td64arr_sub_offset_array(self, box_df_broadcast_failure):
930923
@pytest.mark.parametrize('names', [(None, None, None),
931924
('foo', 'bar', None),
932925
('foo', 'foo', 'foo')])
933-
def test_td64arr_with_offset_series(self, names, box_df_broadcast_failure):
926+
def test_td64arr_with_offset_series(self, names, box_df_fail):
934927
# GH#18849
935-
box = box_df_broadcast_failure
928+
box = box_df_fail
936929
box2 = Series if box is pd.Index else box
937930

938931
tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'],
@@ -963,10 +956,11 @@ def test_td64arr_with_offset_series(self, names, box_df_broadcast_failure):
963956
tm.assert_equal(res3, expected_sub)
964957

965958
@pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series])
966-
def test_td64arr_addsub_anchored_offset_arraylike(
967-
self, obox, box_df_broadcast_failure):
959+
def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box):
968960
# GH#18824
969-
box = box_df_broadcast_failure
961+
if box is pd.DataFrame and obox is not pd.Series:
962+
raise pytest.xfail(reason="Attempts to broadcast incorrectly")
963+
970964
tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'])
971965
tdi = tm.box_expected(tdi, box)
972966

pandas/tests/frame/test_axis_select_reindex.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ def test_align_int_fill_bug(self):
721721

722722
result = df1 - df1.mean()
723723
expected = df2 - df2.mean()
724-
assert_frame_equal(result, expected)
724+
assert_frame_equal(result.astype('f8'), expected)
725725

726726
def test_align_multiindex(self):
727727
# GH 10665

pandas/tests/reshape/test_pivot.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1566,8 +1566,9 @@ def test_crosstab_normalize(self):
15661566
full_normal)
15671567
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'),
15681568
row_normal)
1569-
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'),
1570-
col_normal)
1569+
tm.assert_frame_equal(
1570+
pd.crosstab(df.a, df.b, normalize='columns').astype('f8'),
1571+
col_normal)
15711572
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1),
15721573
pd.crosstab(df.a, df.b, normalize='columns'))
15731574
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0),
@@ -1600,7 +1601,8 @@ def test_crosstab_normalize(self):
16001601
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
16011602
margins=True), row_normal_margins)
16021603
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',
1603-
margins=True), col_normal_margins)
1604+
margins=True).astype('f8'),
1605+
col_normal_margins)
16041606
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True,
16051607
margins=True), all_normal_margins)
16061608

pandas/tests/series/test_operators.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -758,9 +758,6 @@ def test_operators_bitwise(self):
758758
def test_scalar_na_cmp_corners(self):
759759
s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])
760760

761-
def tester(a, b):
762-
return a & b
763-
764761
with pytest.raises(TypeError):
765762
s & datetime(2005, 1, 1)
766763

@@ -780,8 +777,11 @@ def tester(a, b):
780777
# this is an alignment issue; these are equivalent
781778
# https://github.com/pandas-dev/pandas/issues/5284
782779

783-
pytest.raises(ValueError, lambda: d.__and__(s, axis='columns'))
784-
pytest.raises(ValueError, tester, s, d)
780+
with pytest.raises(TypeError):
781+
d.__and__(s, axis='columns')
782+
783+
with pytest.raises(TypeError):
784+
s & d
785785

786786
# this is wrong as its not a boolean result
787787
# result = d.__and__(s,axis='index')

0 commit comments

Comments
 (0)