diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7e943bb5832e6..8a2be528d8168 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -721,8 +721,9 @@ Other Deprecations - Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`) - Deprecated allowing ``unit="M"`` or ``unit="Y"`` in :class:`Timestamp` constructor with a non-round float value (:issue:`47267`) - Deprecated the ``display.column_space`` global configuration option (:issue:`7576`) +- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`) - Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`) -- + .. --------------------------------------------------------------------------- .. _whatsnew_150.performance: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e841d32bb411d..7e292f4ccf8cb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,6 +4,7 @@ """ from __future__ import annotations +import inspect import operator from textwrap import dedent from typing import ( @@ -14,7 +15,7 @@ cast, final, ) -from warnings import warn +import warnings import numpy as np @@ -586,7 +587,8 @@ def factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int | None = -1, + na_sentinel: int | None | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, size_hint: int | None = None, ) -> tuple[np.ndarray, np.ndarray | Index]: """ @@ -604,7 +606,19 @@ def factorize( Value to mark "not found". 
If None, will not drop the NaN from the uniques of the values. + .. deprecated:: 1.5.0 + The na_sentinel argument is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel as + either True or False. + .. versionchanged:: 1.1.2 + + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. + + .. versionadded:: 1.5.0 {size_hint}\ Returns @@ -652,8 +666,8 @@ def factorize( >>> uniques array(['a', 'b', 'c'], dtype=object) - Missing values are indicated in `codes` with `na_sentinel` - (``-1`` by default). Note that missing values are never + When ``use_na_sentinel=True`` (the default), missing values are indicated in + the `codes` with the sentinel value ``-1`` and missing values are not included in `uniques`. >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) @@ -688,16 +702,16 @@ def factorize( Index(['a', 'c'], dtype='object') If NaN is in the values, and we want to include NaN in the uniques of the - values, it can be achieved by setting ``na_sentinel=None``. + values, it can be achieved by setting ``use_na_sentinel=False``. >>> values = np.array([1, 2, 1, np.nan]) - >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True >>> codes array([ 0, 1, 0, -1]) >>> uniques array([1., 2.]) - >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes, uniques = pd.factorize(values, use_na_sentinel=False) >>> codes array([0, 1, 0, 2]) >>> uniques @@ -712,6 +726,7 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. 
+ na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) @@ -736,9 +751,22 @@ def factorize( codes, uniques = values.factorize(sort=sort) return _re_wrap_factorize(original, uniques, codes) - if not isinstance(values.dtype, np.dtype): - # i.e. ExtensionDtype - codes, uniques = values.factorize(na_sentinel=na_sentinel) + elif not isinstance(values.dtype, np.dtype): + if ( + na_sentinel == -1 + and "use_na_sentinel" in inspect.signature(values.factorize).parameters + ): + # Avoid using catch_warnings when possible + # GH#46910 - TimelikeOps has deprecated signature + codes, uniques = values.factorize( # type: ignore[call-arg] + use_na_sentinel=True + ) + else: + with warnings.catch_warnings(): + # We've already warned above + warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) + codes, uniques = values.factorize(na_sentinel=na_sentinel) + else: values = np.asarray(values) # convert DTA/TDA/MultiIndex codes, uniques = factorize_array( @@ -763,6 +791,56 @@ def factorize( return _re_wrap_factorize(original, uniques, codes) +def resolve_na_sentinel( + na_sentinel: int | None | lib.NoDefault, + use_na_sentinel: bool | lib.NoDefault, +) -> int | None: + """ + Determine value of na_sentinel for factorize methods. + + See GH#46910 for details on the deprecation. + + Parameters + ---------- + na_sentinel : int, None, or lib.no_default + Value passed to the method. + use_na_sentinel : bool or lib.no_default + Value passed to the method. + + Returns + ------- + Resolved value of na_sentinel. 
+ """ + if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: + raise ValueError( + "Cannot specify both `na_sentinel` and `use_na_sentinel`; " + f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" + ) + if na_sentinel is lib.no_default: + result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None + else: + if na_sentinel is None: + msg = ( + "Specifying `na_sentinel=None` is deprecated, specify " + "`use_na_sentinel=False` instead." + ) + elif na_sentinel == -1: + msg = ( + "Specifying `na_sentinel=-1` is deprecated, specify " + "`use_na_sentinel=True` instead." + ) + else: + msg = ( + "Specifying the specific value to use for `na_sentinel` is " + "deprecated and will be removed in a future version of pandas. " + "Specify `use_na_sentinel=True` to use the sentinel value -1, and " + "`use_na_sentinel=False` to encode NaN values." + ) + warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) + result = na_sentinel + return result + + def _re_wrap_factorize(original, uniques, codes: np.ndarray): """ Wrap factorize results in Series or Index depending on original type. @@ -956,7 +1034,7 @@ def mode( try: npresult = np.sort(npresult) except TypeError as err: - warn(f"Unable to sort modes: {err}") + warnings.warn(f"Unable to sort modes: {err}") result = _reconstruct_data(npresult, original.dtype, original) return result @@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0): raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: - warn( + warnings.warn( "dtype lost in 'diff()'. In the future this will raise a " "TypeError. 
Convert to a suitable dtype prior to calling 'diff'.", FutureWarning, diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 18d965ff26e10..1c4187f2aafd2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -8,6 +8,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ( Dtype, PositionalIndexer, @@ -31,6 +32,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.algorithms import resolve_na_sentinel from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( @@ -286,7 +288,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: return type(self)(pc.drop_null(self._data)) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel encoded = self._data.dictionary_encode() indices = pa.chunked_array( [c.indices for c in encoded.chunks], type=encoded.type.index_type diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index eb3c6d6d26101..4274e6e5a911c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,6 +8,7 @@ """ from __future__ import annotations +import inspect import operator from typing import ( TYPE_CHECKING, @@ -20,6 +21,7 @@ cast, overload, ) +import warnings import numpy as np @@ -45,6 +47,7 @@ cache_readonly, deprecate_nonkeyword_arguments, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, 
validate_fillna_kwargs, @@ -76,6 +79,7 @@ isin, mode, rank, + resolve_na_sentinel, unique, ) from pandas.core.array_algos.quantile import quantile_with_mask @@ -456,6 +460,24 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] """ return ~(self == other) + def __init_subclass__(cls, **kwargs): + factorize = getattr(cls, "factorize") + if ( + "use_na_sentinel" not in inspect.signature(factorize).parameters + # TimelikeOps uses old factorize args to ensure we don't break things + and cls.__name__ not in ("TimelikeOps", "DatetimeArray", "TimedeltaArray") + ): + # See GH#46910 for details on the deprecation + name = cls.__name__ + warnings.warn( + f"The `na_sentinel` argument of `{name}.factorize` is deprecated. " + f"In the future, pandas will use the `use_na_sentinel` argument " + f"instead. Add this argument to `{name}.factorize` to be compatible " + f"with future versions of pandas and silence this warning.", + DeprecationWarning, + stacklevel=find_stack_level(), + ) + def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -1002,7 +1024,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -1011,6 +1037,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: na_sentinel : int, default -1 Value to use in the `codes` array to indicate missing values. + .. deprecated:: 1.5.0 + The na_sentinel argument is deprecated and + will be removed in a future version of pandas. Specify use_na_sentinel + as either True or False. + + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. 
If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. + + .. versionadded:: 1.5.0 + Returns ------- codes : ndarray @@ -1041,6 +1079,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: # original ExtensionArray. # 2. ExtensionArray.factorize. # Complete control over factorization. + resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index f7de291f83d03..d354d28d0f46f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1996,7 +1996,12 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - def factorize(self, na_sentinel=-1, sort: bool = False): + # GH#46910 - Keep old signature to test we don't break things for EA library authors + def factorize( # type:ignore[override] + self, + na_sentinel: int = -1, + sort: bool = False, + ): if self.freq is not None: # We must be unique, so can short-circuit (and retain freq) codes = np.arange(len(self), dtype=np.intp) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 78c82d9a4e478..0a25be38e81df 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -869,7 +869,16 @@ def searchsorted( return self._data.searchsorted(value, side=side, sorter=sorter) @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, ExtensionArray]: + resolved_na_sentinel 
= algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) + if resolved_na_sentinel is None: + raise NotImplementedError("Encoding NaN values is not yet implemented") + else: + na_sentinel = resolved_na_sentinel arr = self._data mask = self._mask diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 0c34229fb5080..26c577886f174 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -848,13 +848,19 @@ def _values_for_factorize(self): # Still override this for hash_pandas_object return np.asarray(self), self.fill_value - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]: + def factorize( + self, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
- codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + codes, uniques = algos.factorize( + np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/base.py b/pandas/core/base.py index b4c2c81ee666f..7541eff9a11d4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1136,8 +1136,15 @@ def _memory_usage(self, deep: bool = False) -> int: """ ), ) - def factorize(self, sort: bool = False, na_sentinel: int | None = -1): - return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) + def factorize( + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, + ): + return algorithms.factorize( + self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + ) _shared_docs[ "searchsorted" diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 05ef155ecbcda..e524021f3a1b8 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -681,13 +681,9 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: codes = self.grouping_vector.codes_info uniques = self.grouping_vector.result_index._values else: - # GH35667, replace dropna=False with na_sentinel=None - if not self._dropna: - na_sentinel = None - else: - na_sentinel = -1 + # GH35667, replace dropna=False with use_na_sentinel=False codes, uniques = algorithms.factorize( - self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel + self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna ) return codes, uniques diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0a8df9d64d512..15d06ef3bc8e5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1518,8 +1518,7 @@ def _get_grouper_for_level( return grouper, None, None 
values = self.get_level_values(level) - na_sentinel = -1 if dropna else None - codes, uniques = algos.factorize(values, sort=True, na_sentinel=na_sentinel) + codes, uniques = algos.factorize(values, sort=True, use_na_sentinel=dropna) assert isinstance(uniques, Index) if self.levels[level]._can_hold_na: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index fdb1ee754a7e6..5b384fbc97c1a 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -43,6 +43,7 @@ from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops +from pandas.core.algorithms import resolve_na_sentinel import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -510,8 +511,13 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: return result def factorize( - self, sort: bool = False, na_sentinel: int | None = -1 + self, + sort: bool = False, + na_sentinel: int | lib.NoDefault = lib.no_default, + use_na_sentinel: bool | lib.NoDefault = lib.no_default, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: + # resolve to emit warning if appropriate + resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index b829b017d5fb1..6e9130b18e94f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -213,7 +213,12 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying `na_sentinel=-1` is deprecated" + else: + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" + with tm.assert_produces_warning(FutureWarning, 
match=msg): + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_codes = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp ) @@ -224,8 +229,15 @@ def test_factorize(self, data_for_grouping, na_sentinel): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying `na_sentinel=-1` is deprecated" + else: + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes_1, uniques_1 = pd.factorize( + data_for_grouping, na_sentinel=na_sentinel + ) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e45bffba944c0..dd067102aba6c 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -177,7 +177,12 @@ class TestMethods(base.BaseMethodsTests): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): # override because we only have 2 unique values - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying `na_sentinel=-1` is deprecated" + else: + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) expected_labels = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp ) diff --git a/pandas/tests/extension/test_extension.py 
b/pandas/tests/extension/test_extension.py index 1ed626cd51080..a4b1a4b43ef2b 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays import ExtensionArray @@ -24,3 +25,16 @@ def test_errors(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators with pytest.raises(AttributeError): getattr(data, op_name) + + +def test_depr_na_sentinel(): + # GH#46910 + msg = "The `na_sentinel` argument of `MyEA.factorize` is deprecated" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + + class MyEA(ExtensionArray): + def factorize(self, na_sentinel=-1): + pass + + with tm.assert_produces_warning(None): + MyEA() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 068394adac86c..def63c552e059 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -75,11 +75,11 @@ def test_factorize(self, index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques, exact=True) - def test_series_factorize_na_sentinel_none(self): + def test_series_factorize_use_na_sentinel_false(self): # GH#35667 values = np.array([1, 2, 1, np.nan]) ser = Series(values) - codes, uniques = ser.factorize(na_sentinel=None) + codes, uniques = ser.factorize(use_na_sentinel=False) expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = Index([1.0, 2.0, np.nan]) @@ -87,6 +87,20 @@ def test_series_factorize_na_sentinel_none(self): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) + @pytest.mark.parametrize("na_sentinel", [None, -1, -10]) + def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): + # GH#46910 + if na_sentinel is None: + msg = "Specifying `na_sentinel=None` is deprecated" + elif na_sentinel == -1: + msg = "Specifying `na_sentinel=-1` 
is deprecated" + else: + msg = "Specifying the specific value to use for `na_sentinel` is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) + with tm.assert_produces_warning(FutureWarning, match=msg): + index_or_series_obj.factorize(na_sentinel=na_sentinel) + def test_basic(self): codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -418,7 +432,12 @@ def test_parametrized_factorize_na_value(self, data, na_value): ids=["numpy_array", "extension_array"], ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + if na_sentinel == -1: + msg = "Specifying `na_sentinel=-1` is deprecated" + else: + msg = "the specific value to use for `na_sentinel` is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = algos.safe_sort(uniques) @@ -446,10 +465,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): ), ], ) - def test_object_factorize_na_sentinel_none( + def test_object_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @@ -469,10 +488,10 @@ def test_object_factorize_na_sentinel_none( ), ], ) - def test_int_factorize_na_sentinel_none( + def test_int_factorize_use_na_sentinel_false( self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, na_sentinel=None) + codes, uniques = algos.factorize(data, use_na_sentinel=False) tm.assert_numpy_array_equal(uniques, expected_uniques) 
tm.assert_numpy_array_equal(codes, expected_codes)