Skip to content

Commit 88b78f6

Browse files
jbrockmendel and topper-123
authored and committed
PERF: Series.any (pandas-dev#52341)
* PERF: Series.any
* optimize
* remove unnecessary np.errstate
1 parent 8068a18 commit 88b78f6

File tree

3 files changed

+60
-68
lines changed

3 files changed

+60
-68
lines changed

doc/source/whatsnew/v2.1.0.rst

+2
Original file line number | Diff line number | Diff line change
@@ -149,6 +149,8 @@ Performance improvements
149149
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
150150
- Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`)
151151
- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
152+
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
153+
-
152154

153155
.. ---------------------------------------------------------------------------
154156
.. _whatsnew_210.bug_fixes:

pandas/core/dtypes/common.py

+1-5
Original file line number | Diff line number | Diff line change
@@ -1126,13 +1126,9 @@ def needs_i8_conversion(dtype: DtypeObj | None) -> bool:
11261126
>>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern").dtype)
11271127
True
11281128
"""
1129-
if dtype is None:
1130-
return False
11311129
if isinstance(dtype, np.dtype):
11321130
return dtype.kind in "mM"
1133-
elif isinstance(dtype, ExtensionDtype):
1134-
return isinstance(dtype, (PeriodDtype, DatetimeTZDtype))
1135-
return False
1131+
return isinstance(dtype, (PeriodDtype, DatetimeTZDtype))
11361132

11371133

11381134
def is_numeric_dtype(arr_or_dtype) -> bool:

pandas/core/nanops.py

+57-63
Original file line number | Diff line number | Diff line change
@@ -38,15 +38,12 @@
3838
is_any_int_dtype,
3939
is_bool_dtype,
4040
is_complex,
41-
is_datetime64_any_dtype,
4241
is_float,
4342
is_float_dtype,
4443
is_integer,
45-
is_integer_dtype,
4644
is_numeric_dtype,
4745
is_object_dtype,
4846
is_scalar,
49-
is_timedelta64_dtype,
5047
needs_i8_conversion,
5148
pandas_dtype,
5249
)
@@ -161,7 +158,7 @@ def f(
161158

162159
def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
163160
# Bottleneck chokes on datetime64, PeriodDtype (or any EA)
164-
if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
161+
if dtype != object and not needs_i8_conversion(dtype):
165162
# GH 42878
166163
# Bottleneck uses naive summation leading to O(n) loss of precision
167164
# unlike numpy which implements pairwise summation, which has O(log(n)) loss
@@ -248,7 +245,7 @@ def _maybe_get_mask(
248245
Optional[np.ndarray[bool]]
249246
"""
250247
if mask is None:
251-
if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
248+
if values.dtype.kind in "biu":
252249
# Boolean data cannot contain nulls, so signal via mask being None
253250
return None
254251

@@ -264,7 +261,7 @@ def _get_values(
264261
fill_value: Any = None,
265262
fill_value_typ: str | None = None,
266263
mask: npt.NDArray[np.bool_] | None = None,
267-
) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None, np.dtype, np.dtype, Any]:
264+
) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None]:
268265
"""
269266
Utility to get the values view and mask.
270267
@@ -294,12 +291,6 @@ def _get_values(
294291
Potential copy of input value array
295292
mask : Optional[ndarray[bool]]
296293
Mask for values, if deemed necessary to compute
297-
dtype : np.dtype
298-
dtype for values
299-
dtype_max : np.dtype
300-
platform independent dtype
301-
fill_value : Any
302-
fill value used
303294
"""
304295
# _get_values is only called from within nanops, and in all cases
305296
# with scalar fill_value. This guarantee is important for the
@@ -317,31 +308,33 @@ def _get_values(
317308
values = np.asarray(values.view("i8"))
318309
datetimelike = True
319310

320-
dtype_ok = _na_ok_dtype(dtype)
311+
if skipna and (mask is not None):
312+
# get our fill value (in case we need to provide an alternative
313+
# dtype for it)
314+
fill_value = _get_fill_value(
315+
dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
316+
)
321317

322-
# get our fill value (in case we need to provide an alternative
323-
# dtype for it)
324-
fill_value = _get_fill_value(
325-
dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
326-
)
318+
if fill_value is not None:
319+
if mask.any():
320+
if datetimelike or _na_ok_dtype(dtype):
321+
values = values.copy()
322+
np.putmask(values, mask, fill_value)
323+
else:
324+
# np.where will promote if needed
325+
values = np.where(~mask, values, fill_value)
326+
327+
return values, mask
327328

328-
if skipna and (mask is not None) and (fill_value is not None):
329-
if mask.any():
330-
if dtype_ok or datetimelike:
331-
values = values.copy()
332-
np.putmask(values, mask, fill_value)
333-
else:
334-
# np.where will promote if needed
335-
values = np.where(~mask, values, fill_value)
336329

330+
def _get_dtype_max(dtype: np.dtype) -> np.dtype:
337331
# return a platform independent precision dtype
338332
dtype_max = dtype
339-
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
333+
if dtype.kind in "biu":
340334
dtype_max = np.dtype(np.int64)
341-
elif is_float_dtype(dtype):
335+
elif dtype.kind == "f":
342336
dtype_max = np.dtype(np.float64)
343-
344-
return values, mask, dtype, dtype_max, fill_value
337+
return dtype_max
345338

346339

347340
def _na_ok_dtype(dtype: DtypeObj) -> bool:
@@ -355,7 +348,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None):
355348
if result is NaT:
356349
pass
357350

358-
elif is_datetime64_any_dtype(dtype):
351+
elif dtype.kind == "M":
359352
if fill_value is None:
360353
# GH#24293
361354
fill_value = iNaT
@@ -373,7 +366,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None):
373366
else:
374367
# If we have float dtype, taking a view will give the wrong result
375368
result = result.astype(dtype)
376-
elif is_timedelta64_dtype(dtype):
369+
elif dtype.kind == "m":
377370
if not isinstance(result, np.ndarray):
378371
if result == fill_value or np.isnan(result):
379372
result = np.timedelta64("NaT").astype(dtype)
@@ -442,7 +435,7 @@ def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.n
442435
For 2-D values, returns a 1-D array where each element is missing.
443436
"""
444437
# we either return np.nan or pd.NaT
445-
if is_numeric_dtype(values):
438+
if is_numeric_dtype(values.dtype):
446439
values = values.astype("float64")
447440
fill_value = na_value_for_dtype(values.dtype)
448441

@@ -533,11 +526,11 @@ def nanany(
533526
stacklevel=find_stack_level(),
534527
)
535528

536-
values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
529+
values, _ = _get_values(values, skipna, fill_value=False, mask=mask)
537530

538531
# For object type, any won't necessarily return
539532
# boolean values (numpy/numpy#4352)
540-
if is_object_dtype(values):
533+
if values.dtype == object:
541534
values = values.astype(bool)
542535

543536
# error: Incompatible return value type (got "Union[bool_, ndarray]", expected
@@ -588,11 +581,11 @@ def nanall(
588581
stacklevel=find_stack_level(),
589582
)
590583

591-
values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
584+
values, _ = _get_values(values, skipna, fill_value=True, mask=mask)
592585

593586
# For object type, all won't necessarily return
594587
# boolean values (numpy/numpy#4352)
595-
if is_object_dtype(values):
588+
if values.dtype == object:
596589
values = values.astype(bool)
597590

598591
# error: Incompatible return value type (got "Union[bool_, ndarray]", expected
@@ -634,13 +627,12 @@ def nansum(
634627
>>> nanops.nansum(s.values)
635628
3.0
636629
"""
637-
values, mask, dtype, dtype_max, _ = _get_values(
638-
values, skipna, fill_value=0, mask=mask
639-
)
640-
dtype_sum = dtype_max
641-
if is_float_dtype(dtype):
630+
dtype = values.dtype
631+
values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
632+
dtype_sum = _get_dtype_max(dtype)
633+
if dtype.kind == "f":
642634
dtype_sum = dtype
643-
elif is_timedelta64_dtype(dtype):
635+
elif dtype.kind == "m":
644636
dtype_sum = np.dtype(np.float64)
645637

646638
the_sum = values.sum(axis, dtype=dtype_sum)
@@ -702,18 +694,17 @@ def nanmean(
702694
>>> nanops.nanmean(s.values)
703695
1.5
704696
"""
705-
values, mask, dtype, dtype_max, _ = _get_values(
706-
values, skipna, fill_value=0, mask=mask
707-
)
708-
dtype_sum = dtype_max
697+
dtype = values.dtype
698+
values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
699+
dtype_sum = _get_dtype_max(dtype)
709700
dtype_count = np.dtype(np.float64)
710701

711702
# not using needs_i8_conversion because that includes period
712-
if dtype.kind in ["m", "M"]:
703+
if dtype.kind in "mM":
713704
dtype_sum = np.dtype(np.float64)
714-
elif is_integer_dtype(dtype):
705+
elif dtype.kind in "iu":
715706
dtype_sum = np.dtype(np.float64)
716-
elif is_float_dtype(dtype):
707+
elif dtype.kind == "f":
717708
dtype_sum = dtype
718709
dtype_count = dtype
719710

@@ -774,8 +765,9 @@ def get_median(x, _mask=None):
774765
res = np.nanmedian(x[_mask])
775766
return res
776767

777-
values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0)
778-
if not is_float_dtype(values.dtype):
768+
dtype = values.dtype
769+
values, mask = _get_values(values, skipna, mask=mask, fill_value=0)
770+
if values.dtype.kind != "f":
779771
try:
780772
values = values.astype("f8")
781773
except ValueError as err:
@@ -929,7 +921,7 @@ def nanstd(
929921
values = values.view("m8[ns]")
930922

931923
orig_dtype = values.dtype
932-
values, mask, _, _, _ = _get_values(values, skipna, mask=mask)
924+
values, mask = _get_values(values, skipna, mask=mask)
933925

934926
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
935927
return _wrap_results(result, orig_dtype)
@@ -1051,7 +1043,7 @@ def nansem(
10511043
nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
10521044

10531045
mask = _maybe_get_mask(values, skipna, mask)
1054-
if not is_float_dtype(values.dtype):
1046+
if values.dtype.kind != "f":
10551047
values = values.astype("f8")
10561048

10571049
if not skipna and mask is not None and mask.any():
@@ -1073,11 +1065,13 @@ def reduction(
10731065
skipna: bool = True,
10741066
mask: npt.NDArray[np.bool_] | None = None,
10751067
) -> Dtype:
1076-
values, mask, dtype, dtype_max, fill_value = _get_values(
1068+
dtype = values.dtype
1069+
values, mask = _get_values(
10771070
values, skipna, fill_value_typ=fill_value_typ, mask=mask
10781071
)
10791072

10801073
if (axis is not None and values.shape[axis] == 0) or values.size == 0:
1074+
dtype_max = _get_dtype_max(dtype)
10811075
try:
10821076
result = getattr(values, meth)(axis, dtype=dtype_max)
10831077
result.fill(np.nan)
@@ -1135,7 +1129,7 @@ def nanargmax(
11351129
>>> nanops.nanargmax(arr, axis=1)
11361130
array([2, 2, 1, 1])
11371131
"""
1138-
values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask)
1132+
values, mask = _get_values(values, True, fill_value_typ="-inf", mask=mask)
11391133
# error: Need type annotation for 'result'
11401134
result = values.argmax(axis) # type: ignore[var-annotated]
11411135
result = _maybe_arg_null_out(result, axis, mask, skipna)
@@ -1181,7 +1175,7 @@ def nanargmin(
11811175
>>> nanops.nanargmin(arr, axis=1)
11821176
array([0, 0, 1, 1])
11831177
"""
1184-
values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask)
1178+
values, mask = _get_values(values, True, fill_value_typ="+inf", mask=mask)
11851179
# error: Need type annotation for 'result'
11861180
result = values.argmin(axis) # type: ignore[var-annotated]
11871181
result = _maybe_arg_null_out(result, axis, mask, skipna)
@@ -1226,7 +1220,7 @@ def nanskew(
12261220
1.7320508075688787
12271221
"""
12281222
mask = _maybe_get_mask(values, skipna, mask)
1229-
if not is_float_dtype(values.dtype):
1223+
if values.dtype.kind != "f":
12301224
values = values.astype("f8")
12311225
count = _get_counts(values.shape, mask, axis)
12321226
else:
@@ -1262,7 +1256,7 @@ def nanskew(
12621256
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
12631257

12641258
dtype = values.dtype
1265-
if is_float_dtype(dtype):
1259+
if dtype.kind == "f":
12661260
result = result.astype(dtype, copy=False)
12671261

12681262
if isinstance(result, np.ndarray):
@@ -1314,7 +1308,7 @@ def nankurt(
13141308
-1.2892561983471076
13151309
"""
13161310
mask = _maybe_get_mask(values, skipna, mask)
1317-
if not is_float_dtype(values.dtype):
1311+
if values.dtype.kind != "f":
13181312
values = values.astype("f8")
13191313
count = _get_counts(values.shape, mask, axis)
13201314
else:
@@ -1363,7 +1357,7 @@ def nankurt(
13631357
result = numerator / denominator - adj
13641358

13651359
dtype = values.dtype
1366-
if is_float_dtype(dtype):
1360+
if dtype.kind == "f":
13671361
result = result.astype(dtype, copy=False)
13681362

13691363
if isinstance(result, np.ndarray):
@@ -1667,9 +1661,9 @@ def nancov(
16671661

16681662
def _ensure_numeric(x):
16691663
if isinstance(x, np.ndarray):
1670-
if is_integer_dtype(x) or is_bool_dtype(x):
1664+
if x.dtype.kind in "biu":
16711665
x = x.astype(np.float64)
1672-
elif is_object_dtype(x):
1666+
elif x.dtype == object:
16731667
try:
16741668
x = x.astype(np.complex128)
16751669
except (TypeError, ValueError):

0 commit comments

Comments
 (0)