Skip to content

Commit 7103dd9

Browse files
committed
Backport PR pandas-dev#43634: PERF: groupby with string dtype
1 parent 883cd51 commit 7103dd9

File tree

3 files changed

+37
-0
lines changed

3 files changed

+37
-0
lines changed

asv_bench/benchmarks/groupby.py

+32
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,38 @@ def time_sum(self):
568568
self.df.groupby(["a"])["b"].sum()
569569

570570

571+
class String:
    """Benchmark groupby aggregations on frames built with a string dtype.

    Parametrized over the dtype used to construct the frame and the name of
    the aggregation passed to ``agg``.
    """

    # GH#41596
    param_names = ["dtype", "method"]
    params = [
        ["str", "string[python]"],
        [
            "sum", "prod", "min", "max",
            "mean", "median", "var",
            "first", "last",
            "any", "all",
        ],
    ]

    def setup(self, dtype, method):
        """Build the frame: 1,000,000 rows of random ints in [0, 100) cast to ``dtype``."""
        column_names = list("abcdefghjkl")
        n_rows = 1_000_000
        values = np.random.randint(0, 100, size=(n_rows, len(column_names)))
        self.df = DataFrame(values, columns=column_names, dtype=dtype)

    def time_str_func(self, dtype, method):
        """Group by column ``"a"`` and aggregate every other column with ``method``."""
        grouped = self.df.groupby("a")
        # All columns except the first one ("a"), which is the grouping key.
        value_columns = self.df.columns[1:]
        grouped[value_columns].agg(method)
601+
602+
571603
class Categories:
572604
def setup(self):
573605
N = 10 ** 5

doc/source/whatsnew/v1.3.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
1818
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
20+
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
2021
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2122
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
2223
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)

pandas/core/groupby/ops.py

+4
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
BaseMaskedArray,
8282
BaseMaskedDtype,
8383
)
84+
from pandas.core.arrays.string_ import StringDtype
8485
import pandas.core.common as com
8586
from pandas.core.frame import DataFrame
8687
from pandas.core.generic import NDFrame
@@ -395,6 +396,9 @@ def _ea_wrap_cython_operation(
395396
mask=None,
396397
**kwargs,
397398
)
399+
elif isinstance(values.dtype, StringDtype):
400+
# StringArray
401+
npvalues = values.to_numpy(object, na_value=np.nan)
398402
if self.how in ["rank"]:
399403
# i.e. how in WrappedCythonOp.cast_blocklist, since
400404
# other cast_blocklist methods don't go through cython_operation

0 commit comments

Comments
 (0)