|
17 | 17 | from pandas.core.dtypes.common import (
|
18 | 18 | ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
|
19 | 19 | is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
|
20 |
| - is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, |
21 |
| - is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, |
22 |
| - is_numeric_dtype, needs_i8_conversion) |
| 20 | + is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, |
| 21 | + is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, |
| 22 | + is_integer_dtype, is_list_like, is_number, is_numeric_dtype, |
| 23 | + needs_i8_conversion) |
23 | 24 | from pandas.core.dtypes.missing import isnull, na_value_for_dtype
|
24 | 25 |
|
25 | 26 | from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
|
@@ -1589,25 +1590,31 @@ def _right_outer_join(x, y, max_groups):
|
1589 | 1590 |
|
1590 | 1591 |
|
1591 | 1592 | def _factorize_keys(lk, rk, sort=True):
|
| 1593 | + # Some pre-processing for non-ndarray lk / rk |
1592 | 1594 | if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
|
1593 | 1595 | lk = lk.values
|
1594 | 1596 | rk = rk.values
|
1595 | 1597 |
|
1596 |
| - # if we exactly match in categories, allow us to factorize on codes |
1597 |
| - if (is_categorical_dtype(lk) and |
| 1598 | + elif (is_categorical_dtype(lk) and |
1598 | 1599 | is_categorical_dtype(rk) and
|
1599 | 1600 | lk.is_dtype_equal(rk)):
|
1600 |
| - klass = libhashtable.Int64Factorizer |
1601 |
| - |
1602 | 1601 | if lk.categories.equals(rk.categories):
|
| 1602 | + # if we exactly match in categories, allow us to factorize on codes |
1603 | 1603 | rk = rk.codes
|
1604 | 1604 | else:
|
1605 | 1605 | # Same categories in different orders -> recode
|
1606 | 1606 | rk = _recode_for_categories(rk.codes, rk.categories, lk.categories)
|
1607 | 1607 |
|
1608 | 1608 | lk = ensure_int64(lk.codes)
|
1609 | 1609 | rk = ensure_int64(rk)
|
1610 |
| - elif is_integer_dtype(lk) and is_integer_dtype(rk): |
| 1610 | + |
| 1611 | + elif (is_extension_array_dtype(lk.dtype) and |
| 1612 | + is_extension_array_dtype(rk.dtype) and |
| 1613 | + lk.dtype == rk.dtype): |
| 1614 | + lk, _ = lk._values_for_factorize() |
| 1615 | + rk, _ = rk._values_for_factorize() |
| 1616 | + |
| 1617 | + if is_integer_dtype(lk) and is_integer_dtype(rk): |
1611 | 1618 | # GH#23917 TODO: needs tests for case where lk is integer-dtype
|
1612 | 1619 | # and rk is datetime-dtype
|
1613 | 1620 | klass = libhashtable.Int64Factorizer
|
|
0 commit comments