Skip to content

Commit 1845699

Browse files
authored
Speed up StringDtype arrow implementation for merge (#54510)
1 parent: cc76b52 · commit: 1845699

File tree

1 file changed

+18 -14 lines changed

1 file changed

+18 -14 lines changed

pandas/core/reshape/merge.py

+18 -14
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,7 @@
9090
ExtensionArray,
9191
)
9292
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
93+
from pandas.core.arrays.string_ import StringDtype
9394
import pandas.core.common as com
9495
from pandas.core.construction import (
9596
ensure_wrapped_if_datetimelike,
@@ -2399,21 +2400,9 @@ def _factorize_keys(
23992400
rk = ensure_int64(rk.codes)
24002401

24012402
elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
2402-
if not isinstance(lk, BaseMaskedArray) and not (
2403-
# exclude arrow dtypes that would get cast to object
2404-
isinstance(lk.dtype, ArrowDtype)
2405-
and (
2406-
is_numeric_dtype(lk.dtype.numpy_dtype)
2407-
or is_string_dtype(lk.dtype)
2408-
and not sort
2409-
)
2403+
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
2404+
isinstance(lk.dtype, StringDtype) and lk.dtype.storage == "pyarrow"
24102405
):
2411-
lk, _ = lk._values_for_factorize()
2412-
2413-
# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2414-
# "_values_for_factorize"
2415-
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2416-
elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype):
24172406
import pyarrow as pa
24182407
import pyarrow.compute as pc
24192408

@@ -2436,6 +2425,21 @@ def _factorize_keys(
24362425
return rlab, llab, count
24372426
return llab, rlab, count
24382427

2428+
if not isinstance(lk, BaseMaskedArray) and not (
2429+
# exclude arrow dtypes that would get cast to object
2430+
isinstance(lk.dtype, ArrowDtype)
2431+
and (
2432+
is_numeric_dtype(lk.dtype.numpy_dtype)
2433+
or is_string_dtype(lk.dtype)
2434+
and not sort
2435+
)
2436+
):
2437+
lk, _ = lk._values_for_factorize()
2438+
2439+
# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2440+
# "_values_for_factorize"
2441+
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2442+
24392443
if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
24402444
# GH#23917 TODO: Needs tests for non-matching dtypes
24412445
# GH#23917 TODO: needs tests for case where lk is integer-dtype

0 commit comments

Comments
 (0)