|
60 | 60 | is_number,
|
61 | 61 | is_numeric_dtype,
|
62 | 62 | is_object_dtype,
|
| 63 | + is_string_dtype, |
63 | 64 | needs_i8_conversion,
|
64 | 65 | )
|
65 | 66 | from pandas.core.dtypes.dtypes import (
|
@@ -2401,13 +2402,39 @@ def _factorize_keys(
|
2401 | 2402 | if not isinstance(lk, BaseMaskedArray) and not (
|
2402 | 2403 | # exclude arrow dtypes that would get cast to object
|
2403 | 2404 | isinstance(lk.dtype, ArrowDtype)
|
2404 |
| - and is_numeric_dtype(lk.dtype.numpy_dtype) |
| 2405 | + and ( |
| 2406 | + is_numeric_dtype(lk.dtype.numpy_dtype) |
| 2407 | + or is_string_dtype(lk.dtype) |
| 2408 | + and not sort |
| 2409 | + ) |
2405 | 2410 | ):
|
2406 | 2411 | lk, _ = lk._values_for_factorize()
|
2407 | 2412 |
|
2408 | 2413 | # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
|
2409 | 2414 | # "_values_for_factorize"
|
2410 | 2415 | rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
|
| 2416 | + elif isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype): |
| 2417 | + import pyarrow as pa |
| 2418 | + import pyarrow.compute as pc |
| 2419 | + |
| 2420 | + len_lk = len(lk) |
| 2421 | + lk = lk._pa_array # type: ignore[attr-defined] |
| 2422 | + rk = rk._pa_array # type: ignore[union-attr] |
| 2423 | + dc = ( |
| 2424 | + pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] |
| 2425 | + .combine_chunks() |
| 2426 | + .dictionary_encode() |
| 2427 | + ) |
| 2428 | + length = len(dc.dictionary) |
| 2429 | + |
| 2430 | + llab, rlab, count = ( |
| 2431 | + pc.fill_null(dc.indices[slice(len_lk)], length).to_numpy(), |
| 2432 | + pc.fill_null(dc.indices[slice(len_lk, None)], length).to_numpy(), |
| 2433 | + len(dc.dictionary), |
| 2434 | + ) |
| 2435 | + if how == "right": |
| 2436 | + return rlab, llab, count |
| 2437 | + return llab, rlab, count |
2411 | 2438 |
|
2412 | 2439 | if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
|
2413 | 2440 | # GH#23917 TODO: Needs tests for non-matching dtypes
|
|
0 commit comments