@@ -762,10 +762,6 @@ def factorize(
762
762
if not isinstance (values , ABCMultiIndex ):
763
763
values = extract_array (values , extract_numpy = True )
764
764
765
- # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
766
- # of values, assign na_sentinel=-1 to replace code value for NaN.
767
- dropna = na_sentinel is not None
768
-
769
765
if (
770
766
isinstance (values , (ABCDatetimeArray , ABCTimedeltaArray ))
771
767
and values .freq is not None
@@ -793,17 +789,8 @@ def factorize(
793
789
794
790
else :
795
791
values = np .asarray (values ) # convert DTA/TDA/MultiIndex
796
- # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
797
- # na_sentinel is None we append NA on the end because safe_sort does not
798
- # handle null values in uniques.
799
- if na_sentinel is None and sort :
800
- na_sentinel_arg = - 1
801
- elif na_sentinel is None :
802
- na_sentinel_arg = None
803
- else :
804
- na_sentinel_arg = na_sentinel
805
792
806
- if not dropna and not sort and is_object_dtype (values ):
793
+ if na_sentinel is None and is_object_dtype (values ):
807
794
# factorize can now handle differentiating various types of null values.
808
795
# These can only occur when the array has object dtype.
809
796
# However, for backwards compatibility we only use the null for the
@@ -816,32 +803,15 @@ def factorize(
816
803
817
804
codes , uniques = factorize_array (
818
805
values ,
819
- na_sentinel = na_sentinel_arg ,
806
+ na_sentinel = na_sentinel ,
820
807
size_hint = size_hint ,
821
808
)
822
809
823
810
if sort and len (uniques ) > 0 :
824
- if na_sentinel is None :
825
- # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
826
- na_sentinel = - 1
827
811
uniques , codes = safe_sort (
828
812
uniques , codes , na_sentinel = na_sentinel , assume_unique = True , verify = False
829
813
)
830
814
831
- if not dropna and sort :
832
- # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
833
- if na_sentinel is None :
834
- na_sentinel_arg = - 1
835
- else :
836
- na_sentinel_arg = na_sentinel
837
- code_is_na = codes == na_sentinel_arg
838
- if code_is_na .any ():
839
- # na_value is set based on the dtype of uniques, and compat set to False is
840
- # because we do not want na_value to be 0 for integers
841
- na_value = na_value_for_dtype (uniques .dtype , compat = False )
842
- uniques = np .append (uniques , [na_value ])
843
- codes = np .where (code_is_na , len (uniques ) - 1 , codes )
844
-
845
815
uniques = _reconstruct_data (uniques , original .dtype , original )
846
816
847
817
return _re_wrap_factorize (original , uniques , codes )
@@ -1796,7 +1766,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
1796
1766
def safe_sort (
1797
1767
values ,
1798
1768
codes = None ,
1799
- na_sentinel : int = - 1 ,
1769
+ na_sentinel : int | None = - 1 ,
1800
1770
assume_unique : bool = False ,
1801
1771
verify : bool = True ,
1802
1772
) -> np .ndarray | MultiIndex | tuple [np .ndarray | MultiIndex , np .ndarray ]:
@@ -1813,8 +1783,8 @@ def safe_sort(
1813
1783
codes : list_like, optional
1814
1784
Indices to ``values``. All out of bound indices are treated as
1815
1785
"not found" and will be masked with ``na_sentinel``.
1816
- na_sentinel : int, default -1
1817
- Value in ``codes`` to mark "not found".
1786
+ na_sentinel : int or None, default -1
1787
+ Value in ``codes`` to mark "not found", or None to encode null values as normal.
1818
1788
Ignored when ``codes`` is None.
1819
1789
assume_unique : bool, default False
1820
1790
When True, ``values`` are assumed to be unique, which can speed up
@@ -1920,24 +1890,25 @@ def safe_sort(
1920
1890
# may deal with them here without performance loss using `mode='wrap'`
1921
1891
new_codes = reverse_indexer .take (codes , mode = "wrap" )
1922
1892
1923
- mask = codes == na_sentinel
1924
- if verify :
1925
- mask = mask | (codes < - len (values )) | (codes >= len (values ))
1893
+ if na_sentinel is not None :
1894
+ mask = codes == na_sentinel
1895
+ if verify :
1896
+ mask = mask | (codes < - len (values )) | (codes >= len (values ))
1926
1897
1927
- if mask is not None :
1898
+ if na_sentinel is not None and mask is not None :
1928
1899
np .putmask (new_codes , mask , na_sentinel )
1929
1900
1930
1901
return ordered , ensure_platform_int (new_codes )
1931
1902
1932
1903
1933
1904
def _sort_mixed (values ) -> np .ndarray :
1934
- """order ints before strings in 1d arrays, safe in py3 """
1905
+ """order ints before strings before nulls in 1d arrays"""
1935
1906
str_pos = np .array ([isinstance (x , str ) for x in values ], dtype = bool )
1936
- none_pos = np .array ([x is None for x in values ], dtype = bool )
1937
- nums = np .sort (values [~ str_pos & ~ none_pos ])
1907
+ null_pos = np .array ([isna ( x ) for x in values ], dtype = bool )
1908
+ nums = np .sort (values [~ str_pos & ~ null_pos ])
1938
1909
strs = np .sort (values [str_pos ])
1939
1910
return np .concatenate (
1940
- [nums , np .asarray (strs , dtype = object ), np .array (values [none_pos ])]
1911
+ [nums , np .asarray (strs , dtype = object ), np .array (values [null_pos ])]
1941
1912
)
1942
1913
1943
1914
0 commit comments