Skip to content

Commit 163b003

Browse files
authored
REF: Refactor merge array conversions for factorizer (#49978)
1 parent f26d225 commit 163b003

File tree

1 file changed

+36
-29
lines changed

1 file changed

+36
-29
lines changed

pandas/core/reshape/merge.py

+36-29
Original file line numberDiff line numberDiff line change
@@ -2364,35 +2364,7 @@ def _factorize_keys(
23642364
# "_values_for_factorize"
23652365
rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
23662366

2367-
klass: type[libhashtable.Factorizer]
2368-
if is_numeric_dtype(lk.dtype):
2369-
if not is_dtype_equal(lk, rk):
2370-
dtype = find_common_type([lk.dtype, rk.dtype])
2371-
if isinstance(dtype, ExtensionDtype):
2372-
cls = dtype.construct_array_type()
2373-
if not isinstance(lk, ExtensionArray):
2374-
lk = cls._from_sequence(lk, dtype=dtype, copy=False)
2375-
else:
2376-
lk = lk.astype(dtype)
2377-
2378-
if not isinstance(rk, ExtensionArray):
2379-
rk = cls._from_sequence(rk, dtype=dtype, copy=False)
2380-
else:
2381-
rk = rk.astype(dtype)
2382-
else:
2383-
lk = lk.astype(dtype)
2384-
rk = rk.astype(dtype)
2385-
if isinstance(lk, BaseMaskedArray):
2386-
# Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
2387-
# expected type "Type[object]"
2388-
klass = _factorizers[lk.dtype.type] # type: ignore[index]
2389-
else:
2390-
klass = _factorizers[lk.dtype.type]
2391-
2392-
else:
2393-
klass = libhashtable.ObjectFactorizer
2394-
lk = ensure_object(lk)
2395-
rk = ensure_object(rk)
2367+
klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
23962368

23972369
rizer = klass(max(len(lk), len(rk)))
23982370

@@ -2433,6 +2405,41 @@ def _factorize_keys(
24332405
return llab, rlab, count
24342406

24352407

2408+
def _convert_arrays_and_get_rizer_klass(
    lk: ArrayLike, rk: ArrayLike
) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
    """
    Align the two key arrays and pick the Factorizer class to hash them with.

    Parameters
    ----------
    lk : ArrayLike
        Left key values. Only this array's dtype decides whether the numeric
        or the object path is taken.
    rk : ArrayLike
        Right key values.

    Returns
    -------
    tuple of (Factorizer class, ArrayLike, ArrayLike)
        The factorizer class followed by the (possibly converted) left and
        right keys, in that order.
    """
    klass: type[libhashtable.Factorizer]

    if not is_numeric_dtype(lk.dtype):
        # Non-numeric left key: both sides are converted with ensure_object
        # and hashed with the object factorizer.
        klass = libhashtable.ObjectFactorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)
        return klass, lk, rk

    if not is_dtype_equal(lk, rk):
        # Dtypes differ: cast both sides to their common dtype first.
        common = find_common_type([lk.dtype, rk.dtype])
        if not isinstance(common, ExtensionDtype):
            lk = lk.astype(common)
            rk = rk.astype(common)
        else:
            array_type = common.construct_array_type()
            # A side that is already an ExtensionArray is cast with astype;
            # any other side is built through the array type's _from_sequence.
            lk = (
                lk.astype(common)
                if isinstance(lk, ExtensionArray)
                else array_type._from_sequence(lk, dtype=common, copy=False)
            )
            rk = (
                rk.astype(common)
                if isinstance(rk, ExtensionArray)
                else array_type._from_sequence(rk, dtype=common, copy=False)
            )

    # The lookup is the same in both branches; they are kept apart only
    # because the masked-array case needs a mypy suppression.
    if isinstance(lk, BaseMaskedArray):
        # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
        # expected type "Type[object]"
        klass = _factorizers[lk.dtype.type]  # type: ignore[index]
    else:
        klass = _factorizers[lk.dtype.type]

    return klass, lk, rk
2441+
2442+
24362443
def _sort_labels(
24372444
uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
24382445
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:

0 commit comments

Comments
 (0)