4
4
"""
5
5
from __future__ import annotations
6
6
7
- import inspect
8
7
import operator
9
8
from textwrap import dedent
10
9
from typing import (
@@ -524,7 +523,7 @@ def f(c, v):
524
523
525
524
def factorize_array (
526
525
values : np .ndarray ,
527
- na_sentinel : int | None = - 1 ,
526
+ use_na_sentinel : bool = True ,
528
527
size_hint : int | None = None ,
529
528
na_value : object = None ,
530
529
mask : npt .NDArray [np .bool_ ] | None = None ,
@@ -537,7 +536,10 @@ def factorize_array(
537
536
Parameters
538
537
----------
539
538
values : ndarray
540
- na_sentinel : int, default -1
539
+ use_na_sentinel : bool, default True
540
+ If True, the sentinel -1 will be used for NaN values. If False,
541
+ NaN values will be encoded as non-negative integers and will not drop the
542
+ NaN from the uniques of the values.
541
543
size_hint : int, optional
542
544
Passed through to the hashtable's 'get_labels' method
543
545
na_value : object, optional
@@ -555,10 +557,6 @@ def factorize_array(
555
557
codes : ndarray[np.intp]
556
558
uniques : ndarray
557
559
"""
558
- ignore_na = na_sentinel is not None
559
- if not ignore_na :
560
- na_sentinel = - 1
561
-
562
560
original = values
563
561
if values .dtype .kind in ["m" , "M" ]:
564
562
# _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -572,10 +570,10 @@ def factorize_array(
572
570
table = hash_klass (size_hint or len (values ))
573
571
uniques , codes = table .factorize (
574
572
values ,
575
- na_sentinel = na_sentinel ,
573
+ na_sentinel = - 1 ,
576
574
na_value = na_value ,
577
575
mask = mask ,
578
- ignore_na = ignore_na ,
576
+ ignore_na = use_na_sentinel ,
579
577
)
580
578
581
579
# re-cast e.g. i8->dt64/td64, uint8->bool
@@ -610,8 +608,7 @@ def factorize_array(
610
608
def factorize (
611
609
values ,
612
610
sort : bool = False ,
613
- na_sentinel : int | None | lib .NoDefault = lib .no_default ,
614
- use_na_sentinel : bool | lib .NoDefault = lib .no_default ,
611
+ use_na_sentinel : bool = True ,
615
612
size_hint : int | None = None ,
616
613
) -> tuple [np .ndarray , np .ndarray | Index ]:
617
614
"""
@@ -625,17 +622,6 @@ def factorize(
625
622
Parameters
626
623
----------
627
624
{values}{sort}
628
- na_sentinel : int or None, default -1
629
- Value to mark "not found". If None, will not drop the NaN
630
- from the uniques of the values.
631
-
632
- .. deprecated:: 1.5.0
633
- The na_sentinel argument is deprecated and
634
- will be removed in a future version of pandas. Specify use_na_sentinel as
635
- either True or False.
636
-
637
- .. versionchanged:: 1.1.2
638
-
639
625
use_na_sentinel : bool, default True
640
626
If True, the sentinel -1 will be used for NaN values. If False,
641
627
NaN values will be encoded as non-negative integers and will not drop the
@@ -748,12 +734,6 @@ def factorize(
748
734
# Step 2 is dispatched to extension types (like Categorical). They are
749
735
# responsible only for factorization. All data coercion, sorting and boxing
750
736
# should happen here.
751
-
752
- # GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
753
- # na_sentinel=None corresponds to use_na_sentinel=False
754
- # na_sentinel=-1 correspond to use_na_sentinel=True
755
- # Other na_sentinel values will not be supported when the deprecation is enforced.
756
- na_sentinel = resolve_na_sentinel (na_sentinel , use_na_sentinel )
757
737
if isinstance (values , ABCRangeIndex ):
758
738
return values .factorize (sort = sort )
759
739
@@ -772,25 +752,12 @@ def factorize(
772
752
return _re_wrap_factorize (original , uniques , codes )
773
753
774
754
elif not isinstance (values .dtype , np .dtype ):
775
- if (
776
- na_sentinel == - 1 or na_sentinel is None
777
- ) and "use_na_sentinel" in inspect .signature (values .factorize ).parameters :
778
- # Avoid using catch_warnings when possible
779
- # GH#46910 - TimelikeOps has deprecated signature
780
- codes , uniques = values .factorize ( # type: ignore[call-arg]
781
- use_na_sentinel = na_sentinel is not None
782
- )
783
- else :
784
- na_sentinel_arg = - 1 if na_sentinel is None else na_sentinel
785
- with warnings .catch_warnings ():
786
- # We've already warned above
787
- warnings .filterwarnings ("ignore" , ".*use_na_sentinel.*" , FutureWarning )
788
- codes , uniques = values .factorize (na_sentinel = na_sentinel_arg )
755
+ codes , uniques = values .factorize (use_na_sentinel = use_na_sentinel )
789
756
790
757
else :
791
758
values = np .asarray (values ) # convert DTA/TDA/MultiIndex
792
759
793
- if na_sentinel is None and is_object_dtype (values ):
760
+ if not use_na_sentinel and is_object_dtype (values ):
794
761
# factorize can now handle differentiating various types of null values.
795
762
# These can only occur when the array has object dtype.
796
763
# However, for backwards compatibility we only use the null for the
@@ -803,70 +770,24 @@ def factorize(
803
770
804
771
codes , uniques = factorize_array (
805
772
values ,
806
- na_sentinel = na_sentinel ,
773
+ use_na_sentinel = use_na_sentinel ,
807
774
size_hint = size_hint ,
808
775
)
809
776
810
777
if sort and len (uniques ) > 0 :
811
778
uniques , codes = safe_sort (
812
- uniques , codes , na_sentinel = na_sentinel , assume_unique = True , verify = False
779
+ uniques ,
780
+ codes ,
781
+ use_na_sentinel = use_na_sentinel ,
782
+ assume_unique = True ,
783
+ verify = False ,
813
784
)
814
785
815
786
uniques = _reconstruct_data (uniques , original .dtype , original )
816
787
817
788
return _re_wrap_factorize (original , uniques , codes )
818
789
819
790
820
- def resolve_na_sentinel (
821
- na_sentinel : int | None | lib .NoDefault ,
822
- use_na_sentinel : bool | lib .NoDefault ,
823
- ) -> int | None :
824
- """
825
- Determine value of na_sentinel for factorize methods.
826
-
827
- See GH#46910 for details on the deprecation.
828
-
829
- Parameters
830
- ----------
831
- na_sentinel : int, None, or lib.no_default
832
- Value passed to the method.
833
- use_na_sentinel : bool or lib.no_default
834
- Value passed to the method.
835
-
836
- Returns
837
- -------
838
- Resolved value of na_sentinel.
839
- """
840
- if na_sentinel is not lib .no_default and use_na_sentinel is not lib .no_default :
841
- raise ValueError (
842
- "Cannot specify both `na_sentinel` and `use_na_sentile`; "
843
- f"got `na_sentinel={ na_sentinel } ` and `use_na_sentinel={ use_na_sentinel } `"
844
- )
845
- if na_sentinel is lib .no_default :
846
- result = - 1 if use_na_sentinel is lib .no_default or use_na_sentinel else None
847
- else :
848
- if na_sentinel is None :
849
- msg = (
850
- "Specifying `na_sentinel=None` is deprecated, specify "
851
- "`use_na_sentinel=False` instead."
852
- )
853
- elif na_sentinel == - 1 :
854
- msg = (
855
- "Specifying `na_sentinel=-1` is deprecated, specify "
856
- "`use_na_sentinel=True` instead."
857
- )
858
- else :
859
- msg = (
860
- "Specifying the specific value to use for `na_sentinel` is "
861
- "deprecated and will be removed in a future version of pandas. "
862
- "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
863
- "`use_na_sentinel=False` to encode NaN values."
864
- )
865
- warnings .warn (msg , FutureWarning , stacklevel = find_stack_level ())
866
- result = na_sentinel
867
- return result
868
-
869
-
870
791
def _re_wrap_factorize (original , uniques , codes : np .ndarray ):
871
792
"""
872
793
Wrap factorize results in Series or Index depending on original type.
@@ -1764,7 +1685,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
1764
1685
def safe_sort (
1765
1686
values ,
1766
1687
codes = None ,
1767
- na_sentinel : int | None = - 1 ,
1688
+ use_na_sentinel : bool = True ,
1768
1689
assume_unique : bool = False ,
1769
1690
verify : bool = True ,
1770
1691
) -> AnyArrayLike | tuple [AnyArrayLike , np .ndarray ]:
@@ -1780,16 +1701,17 @@ def safe_sort(
1780
1701
Sequence; must be unique if ``codes`` is not None.
1781
1702
codes : list_like, optional
1782
1703
Indices to ``values``. All out of bound indices are treated as
1783
- "not found" and will be masked with ``na_sentinel``.
1784
- na_sentinel : int or None, default -1
1785
- Value in ``codes`` to mark "not found", or None to encode null values as normal.
1786
- Ignored when ``codes`` is None.
1704
+ "not found" and will be masked with ``-1``.
1705
+ use_na_sentinel : bool, default True
1706
+ If True, the sentinel -1 will be used for NaN values. If False,
1707
+ NaN values will be encoded as non-negative integers and will not drop the
1708
+ NaN from the uniques of the values.
1787
1709
assume_unique : bool, default False
1788
1710
When True, ``values`` are assumed to be unique, which can speed up
1789
1711
the calculation. Ignored when ``codes`` is None.
1790
1712
verify : bool, default True
1791
1713
Check if codes are out of bound for the values and put out of bound
1792
- codes equal to na_sentinel . If ``verify=False``, it is assumed there
1714
+ codes equal to ``-1`` . If ``verify=False``, it is assumed there
1793
1715
are no out of bound codes. Ignored when ``codes`` is None.
1794
1716
1795
1717
.. versionadded:: 0.25.0
@@ -1867,7 +1789,7 @@ def safe_sort(
1867
1789
t .map_locations (values )
1868
1790
sorter = ensure_platform_int (t .lookup (ordered ))
1869
1791
1870
- if na_sentinel == - 1 :
1792
+ if use_na_sentinel :
1871
1793
# take_nd is faster, but only works for na_sentinels of -1
1872
1794
order2 = sorter .argsort ()
1873
1795
new_codes = take_nd (order2 , codes , fill_value = - 1 )
@@ -1878,17 +1800,17 @@ def safe_sort(
1878
1800
else :
1879
1801
reverse_indexer = np .empty (len (sorter ), dtype = np .int_ )
1880
1802
reverse_indexer .put (sorter , np .arange (len (sorter )))
1881
- # Out of bound indices will be masked with `na_sentinel ` next, so we
1803
+ # Out of bound indices will be masked with `-1 ` next, so we
1882
1804
# may deal with them here without performance loss using `mode='wrap'`
1883
1805
new_codes = reverse_indexer .take (codes , mode = "wrap" )
1884
1806
1885
- if na_sentinel is not None :
1886
- mask = codes == na_sentinel
1807
+ if use_na_sentinel :
1808
+ mask = codes == - 1
1887
1809
if verify :
1888
1810
mask = mask | (codes < - len (values )) | (codes >= len (values ))
1889
1811
1890
- if na_sentinel is not None and mask is not None :
1891
- np .putmask (new_codes , mask , na_sentinel )
1812
+ if use_na_sentinel and mask is not None :
1813
+ np .putmask (new_codes , mask , - 1 )
1892
1814
1893
1815
return ordered , ensure_platform_int (new_codes )
1894
1816
0 commit comments