Skip to content

Commit c1e309b

Browse files
authored
ENH: Implement more efficient merge for arrow strings (pandas-dev#54443)
* ENH: Implement more efficient merge for arrow strings
* Fix typing
* Update
* ENH: Implement more efficient merge for arrow strings
1 parent 2451b42 commit c1e309b

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

doc/source/whatsnew/v2.1.0.rst

+1 -1
Original file line number | Diff line number | Diff line change
@@ -591,6 +591,7 @@ Performance improvements
591591
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
592592
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
593593
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
594+
- Performance improvement in :func:`merge` for PyArrow backed strings (:issue:`54443`)
594595
- Performance improvement in :func:`read_csv` with ``engine="c"`` (:issue:`52632`)
595596
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
596597
- Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`)
@@ -611,7 +612,6 @@ Performance improvements
611612
- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
612613
- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
613614
- Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`)
614-
-
615615

616616
.. ---------------------------------------------------------------------------
617617
.. _whatsnew_210.bug_fixes:

pandas/core/reshape/merge.py

+28 -1
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@
6060
is_number,
6161
is_numeric_dtype,
6262
is_object_dtype,
63+
is_string_dtype,
6364
needs_i8_conversion,
6465
)
6566
from pandas.core.dtypes.dtypes import (
@@ -2401,13 +2402,39 @@ def _factorize_keys(
24012402
if not isinstance(lk, BaseMaskedArray) and not (
24022403
# exclude arrow dtypes that would get cast to object
24032404
isinstance(lk.dtype, ArrowDtype)
2404-
and is_numeric_dtype(lk.dtype.numpy_dtype)
2405+
and (
2406+
is_numeric_dtype(lk.dtype.numpy_dtype)
2407+
or is_string_dtype(lk.dtype)
2408+
and not sort
2409+
)
24052410
):
24062411
lk, _ = lk._values_for_factorize()
24072412

24082413
# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
24092414
# "_values_for_factorize"
24102415
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2416+
elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype):
2417+
import pyarrow as pa
2418+
import pyarrow.compute as pc
2419+
2420+
len_lk = len(lk)
2421+
lk = lk._pa_array # type: ignore[attr-defined]
2422+
rk = rk._pa_array # type: ignore[union-attr]
2423+
dc = (
2424+
pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr]
2425+
.combine_chunks()
2426+
.dictionary_encode()
2427+
)
2428+
length = len(dc.dictionary)
2429+
2430+
llab, rlab, count = (
2431+
pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(),
2432+
pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(),
2433+
len(dc.dictionary),
2434+
)
2435+
if how == "right":
2436+
return rlab, llab, count
2437+
return llab, rlab, count
24112438

24122439
if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
24132440
# GH#23917 TODO: Needs tests for non-matching dtypes

0 commit comments

Comments
 (0)