Skip to content

Commit 7b6a5c4

Browse files
Backport PR #43634 on branch 1.3.x (PERF: groupby with string dtype) (#43724)
1 parent 7c74541 commit 7b6a5c4

File tree

3 files changed

+54
-0
lines changed

3 files changed

+54
-0
lines changed

asv_bench/benchmarks/groupby.py

+32
Original file line number | Diff line number | Diff line change
@@ -568,6 +568,38 @@ def time_sum(self):
568568
self.df.groupby(["a"])["b"].sum()
569569

570570

571+
class String:
572+
# GH#41596
573+
param_names = ["dtype", "method"]
574+
params = [
575+
["str", "string[python]"],
576+
[
577+
"sum",
578+
"prod",
579+
"min",
580+
"max",
581+
"mean",
582+
"median",
583+
"var",
584+
"first",
585+
"last",
586+
"any",
587+
"all",
588+
],
589+
]
590+
591+
def setup(self, dtype, method):
592+
cols = list("abcdefghjkl")
593+
self.df = DataFrame(
594+
np.random.randint(0, 100, size=(1_000_000, len(cols))),
595+
columns=cols,
596+
dtype=dtype,
597+
)
598+
599+
def time_str_func(self, dtype, method):
600+
self.df.groupby("a")[self.df.columns[1:]].agg(method)
601+
602+
571603
class Categories:
572604
def setup(self):
573605
N = 10 ** 5

doc/source/whatsnew/v1.3.4.rst

+1
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
1818
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
20+
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
2021
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2122
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
2223
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)

pandas/core/groupby/ops.py

+21
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,7 @@
8181
BaseMaskedArray,
8282
BaseMaskedDtype,
8383
)
84+
from pandas.core.arrays.string_ import StringDtype
8485
import pandas.core.common as com
8586
from pandas.core.frame import DataFrame
8687
from pandas.core.generic import NDFrame
@@ -427,6 +428,26 @@ def _ea_wrap_cython_operation(
427428
cls = dtype.construct_array_type()
428429
return cls._from_sequence(res_values, dtype=dtype)
429430

431+
elif isinstance(values.dtype, StringDtype):
432+
# StringArray
433+
npvalues = values.to_numpy(object, na_value=np.nan)
434+
res_values = self._cython_op_ndim_compat(
435+
npvalues,
436+
min_count=min_count,
437+
ngroups=ngroups,
438+
comp_ids=comp_ids,
439+
mask=None,
440+
**kwargs,
441+
)
442+
if self.how in ["rank"]:
443+
# i.e. how in WrappedCythonOp.cast_blocklist, since
444+
# other cast_blocklist methods don't go through cython_operation
445+
return res_values
446+
447+
dtype = self._get_result_dtype(orig_values.dtype)
448+
cls = dtype.construct_array_type()
449+
return cls._from_sequence(res_values, dtype=dtype)
450+
430451
raise NotImplementedError(
431452
f"function is not implemented for this dtype: {values.dtype}"
432453
)

0 commit comments

Comments
 (0)