
Commit a385f75

Merge branch 'master' into styler_bar_colors
2 parents bae65ec + b3e9ae7

File tree

16 files changed: +371 -44 lines changed

asv_bench/benchmarks/groupby.py

Lines changed: 32 additions & 0 deletions

@@ -603,6 +603,38 @@ def time_sum(self):
         self.df.groupby(["a"])["b"].sum()


+class String:
+    # GH#41596
+    param_names = ["dtype", "method"]
+    params = [
+        ["str", "string[python]"],
+        [
+            "sum",
+            "prod",
+            "min",
+            "max",
+            "mean",
+            "median",
+            "var",
+            "first",
+            "last",
+            "any",
+            "all",
+        ],
+    ]
+
+    def setup(self, dtype, method):
+        cols = list("abcdefghjkl")
+        self.df = DataFrame(
+            np.random.randint(0, 100, size=(1_000_000, len(cols))),
+            columns=cols,
+            dtype=dtype,
+        )
+
+    def time_str_func(self, dtype, method):
+        self.df.groupby("a")[self.df.columns[1:]].agg(method)
+
+
 class Categories:
     def setup(self):
         N = 10 ** 5
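As a rough, scaled-down sketch of what this benchmark times (not part of the diff; the row count is reduced here and "first" stands in for any of the parametrized methods), the hot path is a groupby aggregation over StringDtype columns:

import numpy as np
import pandas as pd

cols = list("abcdefghjkl")
df = pd.DataFrame(
    np.random.randint(0, 100, size=(100_000, len(cols))),  # benchmark uses 1_000_000 rows
    columns=cols,
    dtype="string[python]",  # alias from the benchmark's params
)

# the timed call: one cython-backed aggregation per non-key column
df.groupby("a")[df.columns[1:]].agg("first")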

doc/source/whatsnew/v1.3.4.rst

Lines changed: 3 additions & 1 deletion

@@ -17,10 +17,11 @@ Fixed regressions
 - Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
 - Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
 - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
+- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
 - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
 - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
 - Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
--
+- Fixed regression in :meth:`Series.aggregate` attempting to pass ``args`` and ``kwargs`` multiple times to the user supplied ``func`` in certain cases (:issue:`43357`)

 .. ---------------------------------------------------------------------------

@@ -29,6 +30,7 @@ Fixed regressions
 Bug fixes
 ~~~~~~~~~
 - Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)
+- Fixed bug in :meth:`Series.aggregate` not passing the first ``args`` to the user supplied ``func`` in certain cases (:issue:`43357`)

 .. ---------------------------------------------------------------------------

doc/source/whatsnew/v1.4.0.rst

Lines changed: 1 addition & 1 deletion

@@ -357,7 +357,7 @@ Performance improvements
 - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)
 - :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`)
 - Performance improvement in :meth:`SparseArray.take` with ``allow_fill=False`` (:issue:`43654`)
--
+- Performance improvement in :meth:`.Rolling.mean` and :meth:`.Expanding.mean` with ``engine="numba"`` (:issue:`43612`)

 .. ---------------------------------------------------------------------------
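The rolling/expanding entry corresponds to the public engine keyword; a minimal sketch of the call this speeds up (assuming numba is installed):

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(1_000_000))

# the first call JIT-compiles the mean kernel; later calls reuse the
# compiled function from NUMBA_FUNC_CACHE
s.rolling(100).mean(engine="numba")
s.expanding().mean(engine="numba")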

pandas/_libs/index.pyx

Lines changed: 32 additions & 18 deletions

@@ -87,11 +87,7 @@ cdef class IndexEngine:
         values = self.values

         self._check_type(val)
-        try:
-            loc = _bin_search(values, val)  # .searchsorted(val, side='left')
-        except TypeError:
-            # GH#35788 e.g. val=None with float64 values
-            raise KeyError(val)
+        loc = self._searchsorted_left(val)
         if loc >= len(values):
             raise KeyError(val)
         if values[loc] != val:
@@ -110,6 +106,17 @@ cdef class IndexEngine:
             # GH#41775 OverflowError e.g. if we are uint64 and val is -1
             raise KeyError(val)

+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        """
+        See ObjectEngine._searchsorted_left.__doc__.
+        """
+        try:
+            loc = self.values.searchsorted(val, side="left")
+        except TypeError as err:
+            # GH#35788 e.g. val=None with float64 values
+            raise KeyError(val)
+        return loc
+
     cdef inline _get_loc_duplicates(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
         cdef:
@@ -373,6 +380,11 @@ cdef class IndexEngine:


 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
+    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
+    # (treats a tuple `val` as a sequence of keys instead of a single key),
+    # so we implement something similar.
+    # This is equivalent to the stdlib's bisect.bisect_left
+
     cdef:
         Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
         object pval
@@ -405,6 +417,15 @@ cdef class ObjectEngine(IndexEngine):
     cdef _make_hash_table(self, Py_ssize_t n):
         return _hash.PyObjectHashTable(n)

+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        # using values.searchsorted here would treat a tuple `val` as a sequence
+        # instead of a single key, so we use a different implementation
+        try:
+            loc = _bin_search(self.values, val)
+        except TypeError as err:
+            raise KeyError(val) from err
+        return loc
+

 cdef class DatetimeEngine(Int64Engine):

@@ -418,19 +439,12 @@ cdef class DatetimeEngine(Int64Engine):
     def __contains__(self, val: object) -> bool:
         # We assume before we get here:
         #  - val is hashable
-        cdef:
-            int64_t loc, conv
-
-        conv = self._unbox_scalar(val)
-        if self.over_size_threshold and self.is_monotonic_increasing:
-            if not self.is_unique:
-                return self._get_loc_duplicates(conv)
-            values = self.values
-            loc = values.searchsorted(conv, side='left')
-            return values[loc] == conv
-
-        self._ensure_mapping_populated()
-        return conv in self.mapping
+        self._unbox_scalar(val)
+        try:
+            self.get_loc(val)
+            return True
+        except KeyError:
+            return False

     cdef _call_monotonic(self, values):
         return algos.is_monotonic(values, timelike=True)
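The GH#1757 comment is the crux of this change: ndarray.searchsorted cannot be trusted with tuple keys, which is what _bin_search works around. A small pure-Python illustration (not from the diff):

import bisect
import numpy as np

keys = [("a", 1), ("a", 2), ("b", 1)]
arr = np.empty(len(keys), dtype=object)
arr[:] = keys

# arr.searchsorted(("a", 2)) coerces the tuple into TWO separate keys,
# "a" and 2, and returns one position per key (or raises TypeError on
# the mixed comparisons), which is useless for a single-label lookup.
# bisect.bisect_left compares the tuple as one key, the behavior that
# _bin_search reimplements in Cython:
bisect.bisect_left(keys, ("a", 2))  # -> 1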

pandas/core/_numba/__init__.py

Whitespace-only changes.

pandas/core/_numba/executor.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from typing import Callable
+
+import numpy as np
+
+from pandas._typing import Scalar
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.util.numba_ import (
+    NUMBA_FUNC_CACHE,
+    get_jit_arguments,
+)
+
+
+def generate_shared_aggregator(
+    func: Callable[..., Scalar],
+    engine_kwargs: dict[str, bool] | None,
+    cache_key_str: str,
+):
+    """
+    Generate a Numba function that loops over the columns 2D object and applies
+    a 1D numba kernel over each column.
+
+    Parameters
+    ----------
+    func : function
+        aggregation function to be applied to each column
+    engine_kwargs : dict
+        dictionary of arguments to be passed into numba.jit
+    cache_key_str: str
+        string to access the compiled function of the form
+        <caller_type>_<aggregation_type> e.g. rolling_mean, groupby_mean
+
+    Returns
+    -------
+    Numba function
+    """
+    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, None)
+
+    cache_key = (func, cache_key_str)
+    if cache_key in NUMBA_FUNC_CACHE:
+        return NUMBA_FUNC_CACHE[cache_key]
+
+    numba = import_optional_dependency("numba")
+
+    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
+    def column_looper(
+        values: np.ndarray,
+        start: np.ndarray,
+        end: np.ndarray,
+        min_periods: int,
+    ):
+        result = np.empty((len(start), values.shape[1]), dtype=np.float64)
+        for i in numba.prange(values.shape[1]):
+            result[:, i] = func(values[:, i], start, end, min_periods)
+        return result
+
+    return column_looper
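To see how the pieces fit together, here is a sketch of wiring a 1D kernel through generate_shared_aggregator. It relies on private pandas internals that can change without notice, so treat it as illustrative only:

import numpy as np

from pandas.core._numba.executor import generate_shared_aggregator
from pandas.core._numba.kernels import sliding_mean

values = np.random.randn(10, 3)        # 2D block: 10 rows x 3 columns
start = np.arange(8, dtype=np.int64)   # output row i covers values[start[i]:end[i]]
end = start + 3

looper = generate_shared_aggregator(sliding_mean, None, "rolling_mean")
result = looper(values, start, end, 3)  # shape (8, 3): one mean per window per column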
pandas/core/_numba/kernels/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+from pandas.core._numba.kernels.mean_ import sliding_mean
+
+__all__ = ["sliding_mean"]

pandas/core/_numba/kernels/mean_.py

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
+"""
+Numba 1D aggregation kernels that can be shared by
+* Dataframe / Series
+* groupby
+* rolling / expanding
+
+Mirrors pandas/_libs/window/aggregation.pyx
+"""
+from __future__ import annotations
+
+import numba
+import numpy as np
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def is_monotonic_increasing(bounds: np.ndarray) -> bool:
+    """Check if int64 values are monotonically increasing."""
+    n = len(bounds)
+    if n < 2:
+        return True
+    prev = bounds[0]
+    for i in range(1, n):
+        cur = bounds[i]
+        if cur < prev:
+            return False
+        prev = cur
+    return True
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def add_mean(
+    val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
+) -> tuple[int, float, int, float]:
+    if not np.isnan(val):
+        nobs += 1
+        y = val - compensation
+        t = sum_x + y
+        compensation = t - sum_x - y
+        sum_x = t
+        if val < 0:
+            neg_ct += 1
+    return nobs, sum_x, neg_ct, compensation
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def remove_mean(
+    val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
+) -> tuple[int, float, int, float]:
+    if not np.isnan(val):
+        nobs -= 1
+        y = -val - compensation
+        t = sum_x + y
+        compensation = t - sum_x - y
+        sum_x = t
+        if val < 0:
+            neg_ct -= 1
+    return nobs, sum_x, neg_ct, compensation
+
+
+@numba.jit(nopython=True, nogil=True, parallel=False)
+def sliding_mean(
+    values: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    min_periods: int,
+) -> np.ndarray:
+    N = len(start)
+    nobs = 0
+    sum_x = 0.0
+    neg_ct = 0
+    compensation_add = 0.0
+    compensation_remove = 0.0
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing(
+        start
+    ) and is_monotonic_increasing(end)
+
+    output = np.empty(N, dtype=np.float64)
+
+    for i in range(N):
+        s = start[i]
+        e = end[i]
+        if i == 0 or not is_monotonic_increasing_bounds:
+            for j in range(s, e):
+                val = values[j]
+                nobs, sum_x, neg_ct, compensation_add = add_mean(
+                    val, nobs, sum_x, neg_ct, compensation_add
+                )
+        else:
+            for j in range(start[i - 1], s):
+                val = values[j]
+                nobs, sum_x, neg_ct, compensation_remove = remove_mean(
+                    val, nobs, sum_x, neg_ct, compensation_remove
+                )
+
+            for j in range(end[i - 1], e):
+                val = values[j]
+                nobs, sum_x, neg_ct, compensation_add = add_mean(
+                    val, nobs, sum_x, neg_ct, compensation_add
+                )
+
+        if nobs >= min_periods and nobs > 0:
+            result = sum_x / nobs
+            if neg_ct == 0 and result < 0:
+                result = 0
+            elif neg_ct == nobs and result > 0:
+                result = 0
+        else:
+            result = np.nan
+
+        output[i] = result
+
+        if not is_monotonic_increasing_bounds:
+            nobs = 0
+            sum_x = 0.0
+            neg_ct = 0
+            compensation_remove = 0.0
+
+    return output
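add_mean/remove_mean carry a Kahan compensation term so that repeatedly adding and removing values as the window slides does not accumulate floating-point error. A quick sanity check of the kernel's semantics (again via private internals, illustrative only):

import numpy as np

from pandas.core._numba.kernels import sliding_mean

values = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
start = np.arange(3, dtype=np.int64)  # windows [0, 3), [1, 4), [2, 5)
end = start + 3

# NaNs are skipped; a window with fewer than min_periods=2 valid
# values would produce NaN instead
sliding_mean(values, start, end, 2)  # -> array([1.5, 3. , 4.5])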

pandas/core/apply.py

Lines changed: 2 additions & 3 deletions

@@ -1051,7 +1051,6 @@ def agg(self):
         result = super().agg()
         if result is None:
             f = self.f
-            args = self.args
             kwargs = self.kwargs

             # string, list-like, and dict-like are entirely handled in super
@@ -1070,9 +1069,9 @@ def agg(self):
             # then .agg and .apply would have different semantics if the
             # operation is actually defined on the Series, e.g. str
             try:
-                result = self.obj.apply(f, *args, **kwargs)
+                result = self.obj.apply(f)
             except (ValueError, AttributeError, TypeError):
-                result = f(self.obj, *args, **kwargs)
+                result = f(self.obj)

         return result
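The removed *args/**kwargs were already curried into self.f by Apply.__init__, so the old fallback forwarded them a second time (GH#43357). A reproducer along the lines of the issue (func is a hypothetical user function, not from the diff):

import pandas as pd

def func(x, a=0):
    return x.sum() + a

s = pd.Series([1, 2, 3])

# before the fix, the fallback re-passed ``a`` on top of the curried
# function and raised TypeError; now ``a`` is applied exactly once
s.agg(func, a=10)  # -> 16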

pandas/core/generic.py

Lines changed: 2 additions & 0 deletions

@@ -3953,6 +3953,8 @@ def __delitem__(self, key) -> None:
         maybe_shortcut = False
         if self.ndim == 2 and isinstance(self.columns, MultiIndex):
             try:
+                # By using engine's __contains__ we effectively
+                # restrict to same-length tuples
                 maybe_shortcut = key not in self.columns._engine
             except TypeError:
                 pass
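The new comment documents why the engine lookup is a safe shortcut: only full-length tuples can be contained in the engine, so shorter keys fall through to the multi-level deletion path. A sketch of both cases:

import pandas as pd

cols = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")])
df = pd.DataFrame([[1, 2, 3]], columns=cols)

del df[("a", "x")]  # full-length tuple: found via the engine, one column dropped

del df["a"]  # partial key: not in the engine, drops every remaining "a" column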

pandas/core/groupby/ops.py

Lines changed: 7 additions & 1 deletion

@@ -82,6 +82,7 @@
     BaseMaskedArray,
     BaseMaskedDtype,
 )
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.groupby import grouper
@@ -348,6 +349,9 @@ def _ea_wrap_cython_operation(
         elif isinstance(values.dtype, FloatingDtype):
             # FloatingArray
             npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
+        elif isinstance(values.dtype, StringDtype):
+            # StringArray
+            npvalues = values.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {values.dtype}"
@@ -375,7 +379,9 @@ def _reconstruct_ea_result(self, values, res_values):
         """
         # TODO: allow EAs to override this logic

-        if isinstance(values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype)):
+        if isinstance(
+            values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype, StringDtype)
+        ):
             dtype = self._get_result_dtype(values.dtype)
             cls = dtype.construct_array_type()
             return cls._from_sequence(res_values, dtype=dtype)
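With StringDtype wired into both the unwrap and reconstruct steps, cython-backed group aggregations such as first/last keep the extension dtype instead of falling back to object. A small example of the behavior this enables:

import pandas as pd

df = pd.DataFrame(
    {
        "key": [1, 1, 2],
        "val": pd.array(["x", pd.NA, "z"], dtype="string"),
    }
)

# round-trips through the cython path and comes back as StringDtype
df.groupby("key")["val"].first()
# key
# 1    x
# 2    z
# Name: val, dtype: string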
