4
4
"""
5
5
from __future__ import annotations
6
6
7
+ import inspect
7
8
import operator
8
9
from textwrap import dedent
9
10
from typing import (
14
15
cast ,
15
16
final ,
16
17
)
17
- from warnings import warn
18
+ import warnings
18
19
19
20
import numpy as np
20
21
57
58
is_numeric_dtype ,
58
59
is_object_dtype ,
59
60
is_scalar ,
61
+ is_signed_integer_dtype ,
60
62
is_timedelta64_dtype ,
61
63
needs_i8_conversion ,
62
64
)
@@ -446,7 +448,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
446
448
)
447
449
448
450
if not isinstance (values , (ABCIndex , ABCSeries , ABCExtensionArray , np .ndarray )):
449
- values = _ensure_arraylike (list (values ))
451
+ if not is_signed_integer_dtype (comps ):
452
+ # GH#46485 Use object to avoid upcast to float64 later
453
+ # TODO: Share with _find_common_type_compat
454
+ values = construct_1d_object_array_from_listlike (list (values ))
455
+ else :
456
+ values = _ensure_arraylike (list (values ))
450
457
elif isinstance (values , ABCMultiIndex ):
451
458
# Avoid raising in extract_array
452
459
values = np .array (values )
@@ -580,7 +587,8 @@ def factorize_array(
580
587
def factorize (
581
588
values ,
582
589
sort : bool = False ,
583
- na_sentinel : int | None = - 1 ,
590
+ na_sentinel : int | None | lib .NoDefault = lib .no_default ,
591
+ use_na_sentinel : bool | lib .NoDefault = lib .no_default ,
584
592
size_hint : int | None = None ,
585
593
) -> tuple [np .ndarray , np .ndarray | Index ]:
586
594
"""
@@ -598,7 +606,19 @@ def factorize(
598
606
Value to mark "not found". If None, will not drop the NaN
599
607
from the uniques of the values.
600
608
609
+ .. deprecated:: 1.5.0
610
+ The na_sentinel argument is deprecated and
611
+ will be removed in a future version of pandas. Specify use_na_sentinel as
612
+ either True or False.
613
+
601
614
.. versionchanged:: 1.1.2
615
+
616
+ use_na_sentinel : bool, default True
617
+ If True, the sentinel -1 will be used for NaN values. If False,
618
+ NaN values will be encoded as non-negative integers and will not drop the
619
+ NaN from the uniques of the values.
620
+
621
+ .. versionadded:: 1.5.0
602
622
{size_hint}\
603
623
604
624
Returns
@@ -646,8 +666,8 @@ def factorize(
646
666
>>> uniques
647
667
array(['a', 'b', 'c'], dtype=object)
648
668
649
- Missing values are indicated in `codes` with `na_sentinel`
650
- (`` -1`` by default). Note that missing values are never
669
+ When ``use_na_sentinel=True`` (the default), missing values are indicated in
670
+ the `codes` with the sentinel value `` -1`` and missing values are not
651
671
included in `uniques`.
652
672
653
673
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -682,16 +702,16 @@ def factorize(
682
702
Index(['a', 'c'], dtype='object')
683
703
684
704
If NaN is in the values, and we want to include NaN in the uniques of the
685
- values, it can be achieved by setting ``na_sentinel=None ``.
705
+ values, it can be achieved by setting ``use_na_sentinel=False ``.
686
706
687
707
>>> values = np.array([1, 2, 1, np.nan])
688
- >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
708
+ >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
689
709
>>> codes
690
710
array([ 0, 1, 0, -1])
691
711
>>> uniques
692
712
array([1., 2.])
693
713
694
- >>> codes, uniques = pd.factorize(values, na_sentinel=None )
714
+ >>> codes, uniques = pd.factorize(values, use_na_sentinel=False )
695
715
>>> codes
696
716
array([0, 1, 0, 2])
697
717
>>> uniques
@@ -706,6 +726,7 @@ def factorize(
706
726
# responsible only for factorization. All data coercion, sorting and boxing
707
727
# should happen here.
708
728
729
+ na_sentinel = resolve_na_sentinel (na_sentinel , use_na_sentinel )
709
730
if isinstance (values , ABCRangeIndex ):
710
731
return values .factorize (sort = sort )
711
732
@@ -730,9 +751,22 @@ def factorize(
730
751
codes , uniques = values .factorize (sort = sort )
731
752
return _re_wrap_factorize (original , uniques , codes )
732
753
733
- if not isinstance (values .dtype , np .dtype ):
734
- # i.e. ExtensionDtype
735
- codes , uniques = values .factorize (na_sentinel = na_sentinel )
754
+ elif not isinstance (values .dtype , np .dtype ):
755
+ if (
756
+ na_sentinel == - 1
757
+ and "use_na_sentinel" in inspect .signature (values .factorize ).parameters
758
+ ):
759
+ # Avoid using catch_warnings when possible
760
+ # GH#46910 - TimelikeOps has deprecated signature
761
+ codes , uniques = values .factorize ( # type: ignore[call-arg]
762
+ use_na_sentinel = True
763
+ )
764
+ else :
765
+ with warnings .catch_warnings ():
766
+ # We've already warned above
767
+ warnings .filterwarnings ("ignore" , ".*use_na_sentinel.*" , FutureWarning )
768
+ codes , uniques = values .factorize (na_sentinel = na_sentinel )
769
+
736
770
else :
737
771
values = np .asarray (values ) # convert DTA/TDA/MultiIndex
738
772
codes , uniques = factorize_array (
@@ -757,6 +791,56 @@ def factorize(
757
791
return _re_wrap_factorize (original , uniques , codes )
758
792
759
793
794
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel. When ``na_sentinel`` is not given,
        this is ``-1`` if ``use_na_sentinel`` is not given or truthy and
        ``None`` otherwise. When ``na_sentinel`` is given, it is returned
        unchanged.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        Whenever ``na_sentinel`` is explicitly specified.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )

    if na_sentinel is lib.no_default:
        # Only the new-style argument (or nothing) was given: no deprecation
        # warning is needed, just translate the flag into the legacy value.
        if use_na_sentinel is lib.no_default or use_na_sentinel:
            return -1
        return None

    # The deprecated argument was passed explicitly; tailor the message so
    # that it tells the caller exactly which replacement to use.
    if na_sentinel is None:
        msg = (
            "Specifying `na_sentinel=None` is deprecated, specify "
            "`use_na_sentinel=False` instead."
        )
    elif na_sentinel == -1:
        msg = (
            "Specifying `na_sentinel=-1` is deprecated, specify "
            "`use_na_sentinel=True` instead."
        )
    else:
        msg = (
            "Specifying the specific value to use for `na_sentinel` is "
            "deprecated and will be removed in a future version of pandas. "
            "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
            "`use_na_sentinel=False` to encode NaN values."
        )
    warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
    return na_sentinel
842
+
843
+
760
844
def _re_wrap_factorize (original , uniques , codes : np .ndarray ):
761
845
"""
762
846
Wrap factorize results in Series or Index depending on original type.
@@ -950,7 +1034,7 @@ def mode(
950
1034
try :
951
1035
npresult = np .sort (npresult )
952
1036
except TypeError as err :
953
- warn (f"Unable to sort modes: { err } " )
1037
+ warnings . warn (f"Unable to sort modes: { err } " )
954
1038
955
1039
result = _reconstruct_data (npresult , original .dtype , original )
956
1040
return result
@@ -1570,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
1570
1654
raise ValueError (f"cannot diff { type (arr ).__name__ } on axis={ axis } " )
1571
1655
return op (arr , arr .shift (n ))
1572
1656
else :
1573
- warn (
1657
+ warnings . warn (
1574
1658
"dtype lost in 'diff()'. In the future this will raise a "
1575
1659
"TypeError. Convert to a suitable dtype prior to calling 'diff'." ,
1576
1660
FutureWarning ,
0 commit comments