45
45
import pandas .core .algorithms as algos
46
46
from pandas .core .arrays .categorical import _recode_for_categories
47
47
import pandas .core .common as com
48
+ from pandas .core .construction import extract_array
48
49
from pandas .core .frame import _merge_doc
49
50
from pandas .core .internals import concatenate_block_managers
50
51
from pandas .core .sorting import is_int64_overflow_possible
@@ -1820,9 +1821,14 @@ def _right_outer_join(x, y, max_groups):
1820
1821
1821
1822
def _factorize_keys (lk , rk , sort = True ):
1822
1823
# Some pre-processing for non-ndarray lk / rk
1823
- if is_datetime64tz_dtype (lk ) and is_datetime64tz_dtype (rk ):
1824
- lk = getattr (lk , "_values" , lk )._data
1825
- rk = getattr (rk , "_values" , rk )._data
1824
+ lk = extract_array (lk , extract_numpy = True )
1825
+ rk = extract_array (rk , extract_numpy = True )
1826
+
1827
+ if is_datetime64tz_dtype (lk .dtype ) and is_datetime64tz_dtype (rk .dtype ):
1828
+ # Extract the ndarray (UTC-localized) values
1829
+ # Note: we don't need the dtypes to match, as these can still be compared
1830
+ lk , _ = lk ._values_for_factorize ()
1831
+ rk , _ = rk ._values_for_factorize ()
1826
1832
1827
1833
elif (
1828
1834
is_categorical_dtype (lk ) and is_categorical_dtype (rk ) and lk .is_dtype_equal (rk )
@@ -1837,27 +1843,23 @@ def _factorize_keys(lk, rk, sort=True):
1837
1843
lk = ensure_int64 (lk .codes )
1838
1844
rk = ensure_int64 (rk )
1839
1845
1840
- elif (
1841
- is_extension_array_dtype (lk .dtype )
1842
- and is_extension_array_dtype (rk .dtype )
1843
- and lk .dtype == rk .dtype
1844
- ):
1846
+ elif is_extension_array_dtype (lk .dtype ) and is_dtype_equal (lk .dtype , rk .dtype ):
1845
1847
lk , _ = lk ._values_for_factorize ()
1846
1848
rk , _ = rk ._values_for_factorize ()
1847
1849
1848
1850
if is_integer_dtype (lk ) and is_integer_dtype (rk ):
1849
1851
# GH#23917 TODO: needs tests for case where lk is integer-dtype
1850
1852
# and rk is datetime-dtype
1851
1853
klass = libhashtable .Int64Factorizer
1852
- lk = ensure_int64 (com .values_from_object (lk ))
1853
- rk = ensure_int64 (com .values_from_object (rk ))
1854
- elif issubclass (lk .dtype .type , (np .timedelta64 , np .datetime64 )) and issubclass (
1855
- rk .dtype .type , (np .timedelta64 , np .datetime64 )
1856
- ):
1854
+ lk = ensure_int64 (np .asarray (lk ))
1855
+ rk = ensure_int64 (np .asarray (rk ))
1856
+
1857
+ elif needs_i8_conversion (lk .dtype ) and is_dtype_equal (lk .dtype , rk .dtype ):
1857
1858
# GH#23917 TODO: Needs tests for non-matching dtypes
1858
1859
klass = libhashtable .Int64Factorizer
1859
- lk = ensure_int64 (com .values_from_object (lk ))
1860
- rk = ensure_int64 (com .values_from_object (rk ))
1860
+ lk = ensure_int64 (np .asarray (lk , dtype = np .int64 ))
1861
+ rk = ensure_int64 (np .asarray (rk , dtype = np .int64 ))
1862
+
1861
1863
else :
1862
1864
klass = libhashtable .Factorizer
1863
1865
lk = ensure_object (lk )
0 commit comments