4
4
"""
5
5
from __future__ import annotations
6
6
7
+ import inspect
7
8
import operator
8
9
from textwrap import dedent
9
10
from typing import (
14
15
cast ,
15
16
final ,
16
17
)
17
- from warnings import warn
18
+ import warnings
18
19
19
20
import numpy as np
20
21
@@ -586,7 +587,8 @@ def factorize_array(
586
587
def factorize (
587
588
values ,
588
589
sort : bool = False ,
589
- na_sentinel : int | None = - 1 ,
590
+ na_sentinel : int | None | lib .NoDefault = lib .no_default ,
591
+ use_na_sentinel : bool | lib .NoDefault = lib .no_default ,
590
592
size_hint : int | None = None ,
591
593
) -> tuple [np .ndarray , np .ndarray | Index ]:
592
594
"""
@@ -604,7 +606,19 @@ def factorize(
604
606
Value to mark "not found". If None, will not drop the NaN
605
607
from the uniques of the values.
606
608
609
+ .. deprecated:: 1.5.0
610
+ The na_sentinel argument is deprecated and
611
+ will be removed in a future version of pandas. Specify use_na_sentinel as
612
+ either True or False.
613
+
607
614
.. versionchanged:: 1.1.2
615
+
616
+ use_na_sentinel : bool, default True
617
+ If True, the sentinel -1 will be used for NaN values. If False,
618
+ NaN values will be encoded as non-negative integers and will not drop the
619
+ NaN from the uniques of the values.
620
+
621
+ .. versionadded:: 1.5.0
608
622
{size_hint}\
609
623
610
624
Returns
@@ -652,8 +666,8 @@ def factorize(
652
666
>>> uniques
653
667
array(['a', 'b', 'c'], dtype=object)
654
668
655
- Missing values are indicated in `codes` with `na_sentinel`
656
- (`` -1`` by default). Note that missing values are never
669
+ When ``use_na_sentinel=True`` (the default), missing values are indicated in
670
+ the `codes` with the sentinel value `` -1`` and missing values are not
657
671
included in `uniques`.
658
672
659
673
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -688,16 +702,16 @@ def factorize(
688
702
Index(['a', 'c'], dtype='object')
689
703
690
704
If NaN is in the values, and we want to include NaN in the uniques of the
691
- values, it can be achieved by setting ``na_sentinel=None ``.
705
+ values, it can be achieved by setting ``use_na_sentinel=False ``.
692
706
693
707
>>> values = np.array([1, 2, 1, np.nan])
694
- >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
708
+ >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
695
709
>>> codes
696
710
array([ 0, 1, 0, -1])
697
711
>>> uniques
698
712
array([1., 2.])
699
713
700
- >>> codes, uniques = pd.factorize(values, na_sentinel=None )
714
+ >>> codes, uniques = pd.factorize(values, use_na_sentinel=False )
701
715
>>> codes
702
716
array([0, 1, 0, 2])
703
717
>>> uniques
@@ -712,6 +726,7 @@ def factorize(
712
726
# responsible only for factorization. All data coercion, sorting and boxing
713
727
# should happen here.
714
728
729
+ na_sentinel = resolve_na_sentinel (na_sentinel , use_na_sentinel )
715
730
if isinstance (values , ABCRangeIndex ):
716
731
return values .factorize (sort = sort )
717
732
@@ -736,9 +751,22 @@ def factorize(
736
751
codes , uniques = values .factorize (sort = sort )
737
752
return _re_wrap_factorize (original , uniques , codes )
738
753
739
- if not isinstance (values .dtype , np .dtype ):
740
- # i.e. ExtensionDtype
741
- codes , uniques = values .factorize (na_sentinel = na_sentinel )
754
+ elif not isinstance (values .dtype , np .dtype ):
755
+ if (
756
+ na_sentinel == - 1
757
+ and "use_na_sentinel" in inspect .signature (values .factorize ).parameters
758
+ ):
759
+ # Avoid using catch_warnings when possible
760
+ # GH#46910 - TimelikeOps has deprecated signature
761
+ codes , uniques = values .factorize ( # type: ignore[call-arg]
762
+ use_na_sentinel = True
763
+ )
764
+ else :
765
+ with warnings .catch_warnings ():
766
+ # We've already warned above
767
+ warnings .filterwarnings ("ignore" , ".*use_na_sentinel.*" , FutureWarning )
768
+ codes , uniques = values .factorize (na_sentinel = na_sentinel )
769
+
742
770
else :
743
771
values = np .asarray (values ) # convert DTA/TDA/MultiIndex
744
772
codes , uniques = factorize_array (
@@ -763,6 +791,56 @@ def factorize(
763
791
return _re_wrap_factorize (original , uniques , codes )
764
792
765
793
794
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel. When ``na_sentinel`` is not given,
        this is ``-1`` if ``use_na_sentinel`` is unspecified or truthy and
        ``None`` otherwise. When ``na_sentinel`` is given, it is returned
        unchanged.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        If ``na_sentinel`` is specified (with any value).
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        # The two keywords express the same choice; accepting both would make
        # the result ambiguous.
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Non-deprecated path: translate the boolean flag to the legacy value.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # Deprecated path: any explicit na_sentinel warns, with a message
        # tailored to the value so the user knows the replacement spelling.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
842
+
843
+
766
844
def _re_wrap_factorize (original , uniques , codes : np .ndarray ):
767
845
"""
768
846
Wrap factorize results in Series or Index depending on original type.
@@ -956,7 +1034,7 @@ def mode(
956
1034
try :
957
1035
npresult = np .sort (npresult )
958
1036
except TypeError as err :
959
- warn (f"Unable to sort modes: { err } " )
1037
+ warnings . warn (f"Unable to sort modes: { err } " )
960
1038
961
1039
result = _reconstruct_data (npresult , original .dtype , original )
962
1040
return result
@@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
1576
1654
raise ValueError (f"cannot diff { type (arr ).__name__ } on axis={ axis } " )
1577
1655
return op (arr , arr .shift (n ))
1578
1656
else :
1579
- warn (
1657
+ warnings . warn (
1580
1658
"dtype lost in 'diff()'. In the future this will raise a "
1581
1659
"TypeError. Convert to a suitable dtype prior to calling 'diff'." ,
1582
1660
FutureWarning ,
0 commit comments