ENH: Implement more efficient merge for arrow strings (pandas-dev#54443)

phofl · web-flow · commit c1e309baec8c · 2023-08-08T16:46:13.000-07:00
* ENH: Implement more efficient merge for arrow strings

* Fix typing

* Update

* ENH: Implement more efficient merge for arrow strings
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -591,6 +591,7 @@ Performance improvements
 - Performance improvement in :class:`Series` reductions (:issue:`52341`)
 - Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
 - Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
+- Performance improvement in :func:`merge` for PyArrow backed strings (:issue:`54443`)
 - Performance improvement in :func:`read_csv` with ``engine="c"`` (:issue:`52632`)
 - Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
 - Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`)
@@ -611,7 +612,6 @@ Performance improvements
 - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
 - Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
 - Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -60,6 +60,7 @@
     is_number,
     is_numeric_dtype,
     is_object_dtype,
+    is_string_dtype,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.dtypes import (
@@ -2401,13 +2402,39 @@ def _factorize_keys(
         if not isinstance(lk, BaseMaskedArray) and not (
             # exclude arrow dtypes that would get cast to object
             isinstance(lk.dtype, ArrowDtype)
-            and is_numeric_dtype(lk.dtype.numpy_dtype)
+            and (
+                is_numeric_dtype(lk.dtype.numpy_dtype)
+                or is_string_dtype(lk.dtype)
+                and not sort
+            )
         ):
             lk, _ = lk._values_for_factorize()
 
             # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
             # "_values_for_factorize"
             rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]
+        elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype):
+            import pyarrow as pa
+            import pyarrow.compute as pc
+
+            len_lk = len(lk)
+            lk = lk._pa_array  # type: ignore[attr-defined]
+            rk = rk._pa_array  # type: ignore[union-attr]
+            dc = (
+                pa.chunked_array(lk.chunks + rk.chunks)  # type: ignore[union-attr]
+                .combine_chunks()
+                .dictionary_encode()
+            )
+            length = len(dc.dictionary)
+
+            llab, rlab, count = (
+                pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(),
+                pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(),
+                len(dc.dictionary),
+            )
+            if how == "right":
+                return rlab, llab, count
+            return llab, rlab, count
 
     if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
         # GH#23917 TODO: Needs tests for non-matching dtypes