From 1102f0dfec5caaad42f00befc0c9c1b5ecb2cb06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Apr 2020 08:59:14 +0200 Subject: [PATCH 1/5] PERF: operate on arrays instead of Series in DataFrame/DataFrame ops --- pandas/core/ops/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index c14c4a311d66c..ecd6a65e8bbdd 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -325,8 +325,15 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} + array_op = get_array_op(func, str_rep=str_rep) + + arrays = [] + for l, r in zip(left._iter_column_arrays(), right._iter_column_arrays()): + arrays.append(array_op(l, r)) + + return type(left)._from_arrays( + arrays, left.columns, left.index, verify_integrity=False + ) elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, From 44ec4f4c5f0f6c15148e570fbec0fc2a7047f95b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 16 Apr 2020 13:54:44 +0200 Subject: [PATCH 2/5] also for frame/series cases --- pandas/core/ops/__init__.py | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 28a8d409d0a4d..cb692c29dd34d 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -310,10 +310,6 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): ------- DataFrame """ - # Note: we use iloc to access columns for compat with cases - # with non-unique columns. 
- import pandas.core.computation.expressions as expressions - right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: @@ -322,19 +318,15 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): bm = left._mgr.apply(array_op, right=right) return type(left)(bm) - elif isinstance(right, ABCDataFrame): - assert right._indexed_same(left) + array_op = get_array_op(func, str_rep=str_rep) - array_op = get_array_op(func, str_rep=str_rep) + if isinstance(right, ABCDataFrame): + assert right._indexed_same(left) arrays = [] for l, r in zip(left._iter_column_arrays(), right._iter_column_arrays()): arrays.append(array_op(l, r)) - return type(left)._from_arrays( - arrays, left.columns, left.index, verify_integrity=False - ) - elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via _combine_series_frame, # in which case we specifically want to operate row-by-row @@ -345,27 +337,28 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): # Note: we do not do this unconditionally as it may be lossy or # expensive for EA dtypes. 
right = np.asarray(right) - - def column_op(a, b): - return {i: func(a.iloc[:, i], b[i]) for i in range(len(a.columns))} - else: + right = right._values - def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} + arrays = [] + for l, r in zip(left._iter_column_arrays(), right): + arrays.append(array_op(l, r)) elif isinstance(right, ABCSeries): assert right.index.equals(left.index) # Handle other cases later + right = right._values - def column_op(a, b): - return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} + arrays = [] + for l in left._iter_column_arrays(): + arrays.append(array_op(l, right)) else: # Remaining cases have less-obvious dispatch rules raise NotImplementedError(right) - new_data = expressions.evaluate(column_op, str_rep, left, right) - return new_data + return type(left)._from_arrays( + arrays, left.columns, left.index, verify_integrity=False + ) # ----------------------------------------------------------------------------- From 9ff61c4296f36f4c66becf6d0d04006834bdac5e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 17 Apr 2020 14:53:18 +0200 Subject: [PATCH 3/5] cleanup --- pandas/core/ops/__init__.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index cb692c29dd34d..7e08a164b2afb 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -310,17 +310,15 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): ------- DataFrame """ + # Get the appropriate array-op to apply to each column/block's values. + array_op = get_array_op(func, str_rep=str_rep) + right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: - - # Get the appropriate array-op to apply to each block's values. 
- array_op = get_array_op(func, str_rep=str_rep) bm = left._mgr.apply(array_op, right=right) return type(left)(bm) - array_op = get_array_op(func, str_rep=str_rep) - - if isinstance(right, ABCDataFrame): + elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) arrays = [] From b8de50e41c1f095c1ed5bc0ae64dd1336e6d2c49 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 22 May 2020 14:45:24 +0200 Subject: [PATCH 4/5] use list comprehension --- pandas/core/ops/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 5e1959f9e8291..1a190df7639ab 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -339,17 +339,13 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): else: right = right._values - arrays = [] - for l, r in zip(left._iter_column_arrays(), right): - arrays.append(array_op(l, r)) + arrays = [array_op(l, r) for l, r in zip(left._iter_column_arrays(), right)] elif isinstance(right, ABCSeries): assert right.index.equals(left.index) # Handle other cases later right = right._values - arrays = [] - for l in left._iter_column_arrays(): - arrays.append(array_op(l, right)) + arrays = [array_op(l, right) for l in left._iter_column_arrays()] else: # Remaining cases have less-obvious dispatch rules From 103db6c7184748be4957c021d8a1a65c6735e105 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 22 May 2020 20:36:03 +0200 Subject: [PATCH 5/5] fixup --- pandas/core/ops/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 1a190df7639ab..ccf0b11e57fb9 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -322,7 +322,6 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) - array_op = get_array_op(func, str_rep=str_rep) bm = 
operate_blockwise(left, right, array_op) return type(left)(bm)