Skip to content

Commit e98f9b7

Browse files
jbrockmendel authored and AlexKirko committed
PERF: implement scalar ops blockwise (pandas-dev#29853)
1 parent 9a8f8a6 commit e98f9b7

File tree

11 files changed

+124
-24
lines changed

11 files changed

+124
-24
lines changed

asv_bench/benchmarks/binary_ops.py

+32
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import operator
2+
13
import numpy as np
24

35
from pandas import DataFrame, Series, date_range
@@ -9,6 +11,36 @@
911
import pandas.computation.expressions as expr
1012

1113

14+
class IntFrameWithScalar:
    """Benchmark a binary operator applied to a DataFrame and a scalar.

    The parameter grid is the product of three lists: the dtype the frame
    is cast to, the scalar operand, and the operator function (arithmetic
    and comparison functions from the ``operator`` module).
    """

    params = [
        [np.float64, np.int64],
        [2, 3.0, np.int32(4), np.float64(5)],
        [
            operator.add,
            operator.sub,
            operator.mul,
            operator.truediv,
            operator.floordiv,
            operator.pow,
            operator.mod,
            operator.eq,
            operator.ne,
            operator.gt,
            operator.ge,
            operator.lt,
            operator.le,
        ],
    ]
    param_names = ["dtype", "scalar", "op"]

    def setup(self, dtype, scalar, op):
        """Create the 20000 x 100 frame of the requested dtype."""
        # The frame is built here so that its construction is kept out of
        # ``time_frame_op_with_scalar``, which only applies the operator.
        arr = np.random.randn(20000, 100)
        self.df = DataFrame(arr.astype(dtype))

    def time_frame_op_with_scalar(self, dtype, scalar, op):
        """Apply ``op`` to the prepared frame and the scalar operand."""
        op(self.df, scalar)
42+
43+
1244
class Ops:
1345

1446
params = [[True, False], ["default", 1]]

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
672672
Performance improvements
673673
~~~~~~~~~~~~~~~~~~~~~~~~
674674

675+
- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`)
675676
- Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`)
676677
- Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`)
677678
- Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`)

pandas/core/arrays/datetimelike.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray)
325325
_generate_range
326326
"""
327327

328+
@property
def ndim(self) -> int:
    """Number of dimensions of the underlying ``_data`` array."""
    return self._data.ndim
331+
332+
@property
def shape(self):
    """Shape of the underlying ``_data`` array."""
    return self._data.shape
335+
336+
def reshape(self, *args, **kwargs):
    """Return a new array of the same type wrapping the reshaped ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.reshape``. The result is built with ``type(self)`` and
    the current ``dtype``.
    """
    # Note: we drop any freq
    data = self._data.reshape(*args, **kwargs)
    return type(self)(data, dtype=self.dtype)
340+
341+
def ravel(self, *args, **kwargs):
    """Return a new array of the same type wrapping the raveled ``_data``.

    All positional and keyword arguments are forwarded unchanged to
    ``self._data.ravel``. The result is built with ``type(self)`` and
    the current ``dtype``.
    """
    # Note: we drop any freq
    data = self._data.ravel(*args, **kwargs)
    return type(self)(data, dtype=self.dtype)
345+
328346
@property
329347
def _box_func(self):
330348
"""
@@ -413,7 +431,10 @@ def __getitem__(self, key):
413431
getitem = self._data.__getitem__
414432
if is_int:
415433
val = getitem(key)
416-
return self._box_func(val)
434+
if lib.is_scalar(val):
435+
# i.e. self.ndim == 1
436+
return self._box_func(val)
437+
return type(self)(val, dtype=self.dtype)
417438

418439
if com.is_bool_indexer(key):
419440
key = np.asarray(key, dtype=bool)
@@ -823,6 +844,8 @@ def inferred_freq(self):
823844
generated by infer_freq. Returns None if it can't autodetect the
824845
frequency.
825846
"""
847+
if self.ndim != 1:
848+
return None
826849
try:
827850
return frequencies.infer_freq(self)
828851
except ValueError:
@@ -968,7 +991,7 @@ def _add_timedeltalike_scalar(self, other):
968991
"""
969992
if isna(other):
970993
# i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
971-
new_values = np.empty(len(self), dtype="i8")
994+
new_values = np.empty(self.shape, dtype="i8")
972995
new_values[:] = iNaT
973996
return new_values
974997

@@ -1014,7 +1037,7 @@ def _add_nat(self):
10141037

10151038
# GH#19124 pd.NaT is treated like a timedelta for both timedelta
10161039
# and datetime dtypes
1017-
result = np.zeros(len(self), dtype=np.int64)
1040+
result = np.zeros(self.shape, dtype=np.int64)
10181041
result.fill(iNaT)
10191042
return type(self)(result, dtype=self.dtype, freq=None)
10201043

@@ -1028,7 +1051,7 @@ def _sub_nat(self):
10281051
# For datetime64 dtypes by convention we treat NaT as a datetime, so
10291052
# this subtraction returns a timedelta64 dtype.
10301053
# For period dtype, timedelta64 is a close-enough return dtype.
1031-
result = np.zeros(len(self), dtype=np.int64)
1054+
result = np.zeros(self.shape, dtype=np.int64)
10321055
result.fill(iNaT)
10331056
return result.view("timedelta64[ns]")
10341057

pandas/core/arrays/datetimes.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
339339
" those."
340340
)
341341
raise ValueError(msg)
342-
if values.ndim != 1:
342+
if values.ndim not in [1, 2]:
343343
raise ValueError("Only 1-dimensional input arrays are supported.")
344344

345345
if values.dtype == "i8":
@@ -788,6 +788,9 @@ def _sub_datetime_arraylike(self, other):
788788
return new_values.view("timedelta64[ns]")
789789

790790
def _add_offset(self, offset):
791+
if self.ndim == 2:
792+
return self.ravel()._add_offset(offset).reshape(self.shape)
793+
791794
assert not isinstance(offset, Tick)
792795
try:
793796
if self.tz is not None:

pandas/core/arrays/timedeltas.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
217217
" TimedeltaArray ndarray, or Series or Index containing one of those."
218218
)
219219
raise ValueError(msg)
220-
if values.ndim != 1:
220+
if values.ndim not in [1, 2]:
221221
raise ValueError("Only 1-dimensional input arrays are supported.")
222222

223223
if values.dtype == "i8":
@@ -1036,8 +1036,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
10361036
raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")
10371037

10381038
data = np.array(data, copy=copy)
1039-
if data.ndim != 1:
1040-
raise ValueError("Only 1-dimensional input arrays are supported.")
10411039

10421040
assert data.dtype == "m8[ns]", data
10431041
return data, inferred_freq

pandas/core/internals/blocks.py

+12
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,19 @@ def apply(self, func, **kwargs):
368368
"""
369369
with np.errstate(all="ignore"):
370370
result = func(self.values, **kwargs)
371+
372+
if is_extension_array_dtype(result) and result.ndim > 1:
373+
# if we get a 2D ExtensionArray, we need to split it into 1D pieces
374+
nbs = []
375+
for i, loc in enumerate(self.mgr_locs):
376+
vals = result[i]
377+
nv = _block_shape(vals, ndim=self.ndim)
378+
block = self.make_block(values=nv, placement=[loc])
379+
nbs.append(block)
380+
return nbs
381+
371382
if not isinstance(result, Block):
383+
# Exclude the 0-dim case so we can do reductions
372384
result = self.make_block(values=_block_shape(result, ndim=self.ndim))
373385

374386
return result

pandas/core/internals/managers.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -340,13 +340,13 @@ def _verify_integrity(self):
340340
f"tot_items: {tot_items}"
341341
)
342342

343-
def apply(self, f: str, filter=None, **kwargs):
343+
def apply(self, f, filter=None, **kwargs):
344344
"""
345345
Iterate over the blocks, collect and create a new BlockManager.
346346
347347
Parameters
348348
----------
349-
f : str
349+
f : str or callable
350350
Name of the Block method to apply.
351351
filter : list, if supplied, only call the block if the filter is in
352352
the block
@@ -411,7 +411,10 @@ def apply(self, f: str, filter=None, **kwargs):
411411
axis = obj._info_axis_number
412412
kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
413413

414-
applied = getattr(b, f)(**kwargs)
414+
if callable(f):
415+
applied = b.apply(f, **kwargs)
416+
else:
417+
applied = getattr(b, f)(**kwargs)
415418
result_blocks = _extend_blocks(applied, result_blocks)
416419

417420
if len(result_blocks) == 0:

pandas/core/ops/__init__.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
arithmetic_op,
2727
comparison_op,
2828
define_na_arithmetic_op,
29+
get_array_op,
2930
logical_op,
3031
)
3132
from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401
@@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
372373
right = lib.item_from_zerodim(right)
373374
if lib.is_scalar(right) or np.ndim(right) == 0:
374375

375-
def column_op(a, b):
376-
return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
376+
# Get the appropriate array-op to apply to each block's values.
377+
array_op = get_array_op(func, str_rep=str_rep)
378+
bm = left._data.apply(array_op, right=right)
379+
return type(left)(bm)
377380

378381
elif isinstance(right, ABCDataFrame):
379382
assert right._indexed_same(left)
@@ -713,7 +716,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
713716
if fill_value is not None:
714717
self = self.fillna(fill_value)
715718

716-
new_data = dispatch_to_series(self, other, op)
719+
new_data = dispatch_to_series(self, other, op, str_rep)
717720
return self._construct_result(new_data)
718721

719722
f.__name__ = op_name

pandas/core/ops/array_ops.py

+31-6
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
Functions for arithmetic and comparison operations on NumPy arrays and
33
ExtensionArrays.
44
"""
5+
from functools import partial
56
import operator
6-
from typing import Any, Union
7+
from typing import Any, Optional, Union
78

89
import numpy as np
910

@@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y):
5152
if isinstance(y, (ABCSeries, ABCIndex)):
5253
y = y.values
5354

54-
result = libops.vec_compare(x, y, op)
55+
result = libops.vec_compare(x.ravel(), y, op)
5556
else:
56-
result = libops.scalar_compare(x, y, op)
57-
return result
57+
result = libops.scalar_compare(x.ravel(), y, op)
58+
return result.reshape(x.shape)
5859

5960

6061
def masked_arith_op(x, y, op):
@@ -237,9 +238,9 @@ def comparison_op(
237238
elif is_scalar(rvalues) and isna(rvalues):
238239
# numpy does not like comparisons vs None
239240
if op is operator.ne:
240-
res_values = np.ones(len(lvalues), dtype=bool)
241+
res_values = np.ones(lvalues.shape, dtype=bool)
241242
else:
242-
res_values = np.zeros(len(lvalues), dtype=bool)
243+
res_values = np.zeros(lvalues.shape, dtype=bool)
243244

244245
elif is_object_dtype(lvalues.dtype):
245246
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
@@ -367,3 +368,27 @@ def fill_bool(x, left=None):
367368
res_values = filler(res_values) # type: ignore
368369

369370
return res_values
371+
372+
373+
def get_array_op(op, str_rep: Optional[str] = None):
    """
    Return a binary array operation corresponding to the given operator op.

    Parameters
    ----------
    op : function
        Binary operator from operator or roperator module.
    str_rep : str or None, default None
        str_rep to pass to arithmetic_op

    Returns
    -------
    function
        A ``functools.partial`` of ``comparison_op``, ``logical_op`` or
        ``arithmetic_op`` with ``op`` already bound as a keyword argument.
    """
    # Leading/trailing underscores are removed before matching, so a name
    # such as "and_" is looked up as "and".
    op_name = op.__name__.strip("_")
    if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
        return partial(comparison_op, op=op)
    elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
        return partial(logical_op, op=op)
    else:
        # Anything that is not a comparison or logical op is dispatched as
        # arithmetic; only this branch uses ``str_rep``.
        return partial(arithmetic_op, op=op, str_rep=str_rep)

pandas/tests/arrays/test_datetimes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ def test_only_1dim_accepted(self):
2424
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
2525

2626
with pytest.raises(ValueError, match="Only 1-dimensional"):
27-
# 2-dim
28-
DatetimeArray(arr.reshape(2, 2))
27+
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
28+
DatetimeArray(arr.reshape(2, 2, 1))
2929

3030
with pytest.raises(ValueError, match="Only 1-dimensional"):
3131
# 0-dim

pandas/tests/arrays/test_timedeltas.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ def test_only_1dim_accepted(self):
1212
arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]")
1313

1414
with pytest.raises(ValueError, match="Only 1-dimensional"):
15-
# 2-dim
16-
TimedeltaArray(arr.reshape(2, 2))
15+
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
16+
TimedeltaArray(arr.reshape(2, 2, 1))
1717

1818
with pytest.raises(ValueError, match="Only 1-dimensional"):
1919
# 0-dim

0 commit comments

Comments
 (0)