diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c76555f9ef417..b97d0f5232f1e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -441,6 +441,7 @@ Removal of prior version deprecations/changes - Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`) - Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`) - Change behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`) +- Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aca5c4345d247..7494a8a54f9bb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,7 +4,6 @@ """ from __future__ import annotations -import inspect import operator from textwrap import dedent from typing import ( @@ -524,7 +523,7 @@ def f(c, v): def factorize_array( values: np.ndarray, - na_sentinel: int | None = -1, + use_na_sentinel: bool = True, size_hint: int | None = None, na_value: object = None, mask: npt.NDArray[np.bool_] | None = None, @@ -537,7 +536,10 @@ def factorize_array( Parameters ---------- values : ndarray - na_sentinel : int, default -1 + use_na_sentinel : bool, default True + If True, the sentinel -1 will be used for NaN values. If False, + NaN values will be encoded as non-negative integers and will not drop the + NaN from the uniques of the values. 
size_hint : int, optional Passed through to the hashtable's 'get_labels' method na_value : object, optional @@ -555,10 +557,6 @@ def factorize_array( codes : ndarray[np.intp] uniques : ndarray """ - ignore_na = na_sentinel is not None - if not ignore_na: - na_sentinel = -1 - original = values if values.dtype.kind in ["m", "M"]: # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we @@ -572,10 +570,10 @@ def factorize_array( table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( values, - na_sentinel=na_sentinel, + na_sentinel=-1, na_value=na_value, mask=mask, - ignore_na=ignore_na, + ignore_na=use_na_sentinel, ) # re-cast e.g. i8->dt64/td64, uint8->bool @@ -610,8 +608,7 @@ def factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int | None | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, size_hint: int | None = None, ) -> tuple[np.ndarray, np.ndarray | Index]: """ @@ -625,17 +622,6 @@ def factorize( Parameters ---------- {values}{sort} - na_sentinel : int or None, default -1 - Value to mark "not found". If None, will not drop the NaN - from the uniques of the values. - - .. deprecated:: 1.5.0 - The na_sentinel argument is deprecated and - will be removed in a future version of pandas. Specify use_na_sentinel as - either True or False. - - .. versionchanged:: 1.1.2 - use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NaN values. If False, NaN values will be encoded as non-negative integers and will not drop the @@ -748,12 +734,6 @@ def factorize( # Step 2 is dispatched to extension types (like Categorical). They are # responsible only for factorization. All data coercion, sorting and boxing # should happen here. 
- - # GH#46910 deprecated na_sentinel in favor of use_na_sentinel: - # na_sentinel=None corresponds to use_na_sentinel=False - # na_sentinel=-1 correspond to use_na_sentinel=True - # Other na_sentinel values will not be supported when the deprecation is enforced. - na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) if isinstance(values, ABCRangeIndex): return values.factorize(sort=sort) @@ -772,25 +752,12 @@ def factorize( return _re_wrap_factorize(original, uniques, codes) elif not isinstance(values.dtype, np.dtype): - if ( - na_sentinel == -1 or na_sentinel is None - ) and "use_na_sentinel" in inspect.signature(values.factorize).parameters: - # Avoid using catch_warnings when possible - # GH#46910 - TimelikeOps has deprecated signature - codes, uniques = values.factorize( # type: ignore[call-arg] - use_na_sentinel=na_sentinel is not None - ) - else: - na_sentinel_arg = -1 if na_sentinel is None else na_sentinel - with warnings.catch_warnings(): - # We've already warned above - warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) - codes, uniques = values.factorize(na_sentinel=na_sentinel_arg) + codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel) else: values = np.asarray(values) # convert DTA/TDA/MultiIndex - if na_sentinel is None and is_object_dtype(values): + if not use_na_sentinel and is_object_dtype(values): # factorize can now handle differentiating various types of null values. # These can only occur when the array has object dtype. 
# However, for backwards compatibility we only use the null for the @@ -803,13 +770,17 @@ def factorize( codes, uniques = factorize_array( values, - na_sentinel=na_sentinel, + use_na_sentinel=use_na_sentinel, size_hint=size_hint, ) if sort and len(uniques) > 0: uniques, codes = safe_sort( - uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False + uniques, + codes, + use_na_sentinel=use_na_sentinel, + assume_unique=True, + verify=False, ) uniques = _reconstruct_data(uniques, original.dtype, original) @@ -817,56 +788,6 @@ def factorize( return _re_wrap_factorize(original, uniques, codes) -def resolve_na_sentinel( - na_sentinel: int | None | lib.NoDefault, - use_na_sentinel: bool | lib.NoDefault, -) -> int | None: - """ - Determine value of na_sentinel for factorize methods. - - See GH#46910 for details on the deprecation. - - Parameters - ---------- - na_sentinel : int, None, or lib.no_default - Value passed to the method. - use_na_sentinel : bool or lib.no_default - Value passed to the method. - - Returns - ------- - Resolved value of na_sentinel. - """ - if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default: - raise ValueError( - "Cannot specify both `na_sentinel` and `use_na_sentile`; " - f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`" - ) - if na_sentinel is lib.no_default: - result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None - else: - if na_sentinel is None: - msg = ( - "Specifying `na_sentinel=None` is deprecated, specify " - "`use_na_sentinel=False` instead." - ) - elif na_sentinel == -1: - msg = ( - "Specifying `na_sentinel=-1` is deprecated, specify " - "`use_na_sentinel=True` instead." - ) - else: - msg = ( - "Specifying the specific value to use for `na_sentinel` is " - "deprecated and will be removed in a future version of pandas. " - "Specify `use_na_sentinel=True` to use the sentinel value -1, and " - "`use_na_sentinel=False` to encode NaN values." 
- ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - result = na_sentinel - return result - - def _re_wrap_factorize(original, uniques, codes: np.ndarray): """ Wrap factorize results in Series or Index depending on original type. @@ -1764,7 +1685,7 @@ def diff(arr, n: int, axis: AxisInt = 0): def safe_sort( values, codes=None, - na_sentinel: int | None = -1, + use_na_sentinel: bool = True, assume_unique: bool = False, verify: bool = True, ) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]: @@ -1780,16 +1701,17 @@ def safe_sort( Sequence; must be unique if ``codes`` is not None. codes : list_like, optional Indices to ``values``. All out of bound indices are treated as - "not found" and will be masked with ``na_sentinel``. - na_sentinel : int or None, default -1 - Value in ``codes`` to mark "not found", or None to encode null values as normal. - Ignored when ``codes`` is None. + "not found" and will be masked with ``-1``. + use_na_sentinel : bool, default True + If True, ``-1`` in ``codes`` marks "not found" and is kept as ``-1`` in the + returned codes. If False, no sentinel is applied and all codes are treated + as positions in ``values``. Ignored when ``codes`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``codes`` is None. verify : bool, default True Check if codes are out of bound for the values and put out of bound - codes equal to na_sentinel. If ``verify=False``, it is assumed there + codes equal to ``-1``. If ``verify=False``, it is assumed there are no out of bound codes. Ignored when ``codes`` is None. ..
versionadded:: 0.25.0 @@ -1867,7 +1789,7 @@ def safe_sort( t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) - if na_sentinel == -1: + if use_na_sentinel: # take_nd is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_codes = take_nd(order2, codes, fill_value=-1) @@ -1878,17 +1800,17 @@ def safe_sort( else: reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) - # Out of bound indices will be masked with `na_sentinel` next, so we + # Out of bound indices will be masked with `-1` next, so we # may deal with them here without performance loss using `mode='wrap'` new_codes = reverse_indexer.take(codes, mode="wrap") - if na_sentinel is not None: - mask = codes == na_sentinel + if use_na_sentinel: + mask = codes == -1 if verify: mask = mask | (codes < -len(values)) | (codes >= len(values)) - if na_sentinel is not None and mask is not None: - np.putmask(new_codes, mask, na_sentinel) + if use_na_sentinel and mask is not None: + np.putmask(new_codes, mask, -1) return ordered, ensure_platform_int(new_codes) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 945ae52c53047..06d91730804ab 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -8,7 +8,6 @@ import numpy as np -from pandas._libs import lib from pandas._typing import ( Dtype, PositionalIndexer, @@ -31,7 +30,6 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core.algorithms import resolve_na_sentinel from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( @@ -553,11 +551,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: @doc(ExtensionArray.factorize) def factorize( self, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ) -> tuple[np.ndarray, 
ExtensionArray]: - resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) - null_encoding = "mask" if resolved_na_sentinel is not None else "encode" + null_encoding = "mask" if use_na_sentinel else "encode" encoded = self._data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: indices = np.array([], dtype=np.intp) @@ -565,10 +561,7 @@ def factorize( else: pa_indices = encoded.combine_chunks().indices if pa_indices.null_count > 0: - fill_value = ( - resolved_na_sentinel if resolved_na_sentinel is not None else -1 - ) - pa_indices = pc.fill_null(pa_indices, fill_value) + pa_indices = pc.fill_null(pa_indices, -1) indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( np.intp, copy=False ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cc9b2ce3fed42..e536ee434fa55 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,7 +8,6 @@ """ from __future__ import annotations -import inspect import operator from typing import ( TYPE_CHECKING, @@ -22,7 +21,6 @@ cast, overload, ) -import warnings import numpy as np @@ -49,7 +47,6 @@ Substitution, cache_readonly, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -81,7 +78,6 @@ isin, mode, rank, - resolve_na_sentinel, unique, ) from pandas.core.array_algos.quantile import quantile_with_mask @@ -454,24 +450,6 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override] """ return ~(self == other) - def __init_subclass__(cls, **kwargs) -> None: - factorize = getattr(cls, "factorize") - if ( - "use_na_sentinel" not in inspect.signature(factorize).parameters - # TimelikeOps uses old factorize args to ensure we don't break things - and cls.__name__ not in ("TimelikeOps", "DatetimeArray", "TimedeltaArray") - ): - # See GH#46910 for details on the deprecation - name = cls.__name__ - warnings.warn( - f"The `na_sentinel` argument 
of `{name}.factorize` is deprecated. " - f"In the future, pandas will use the `use_na_sentinel` argument " - f"instead. Add this argument to `{name}.factorize` to be compatible " - f"with future versions of pandas and silence this warning.", - DeprecationWarning, - stacklevel=find_stack_level(), - ) - def to_numpy( self, dtype: npt.DTypeLike | None = None, @@ -1009,7 +987,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: na_value : object The value in `values` to consider missing. This will be treated as NA in the factorization routines, so it will be coded as - `na_sentinel` and not included in `uniques`. By default, + `-1` and not included in `uniques`. By default, ``np.nan`` is used. Notes @@ -1021,22 +999,13 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]: def factorize( self, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ) -> tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. Parameters ---------- - na_sentinel : int, default -1 - Value to use in the `codes` array to indicate missing values. - - .. deprecated:: 1.5.0 - The na_sentinel argument is deprecated and - will be removed in a future version of pandas. Specify use_na_sentinel - as either True or False. - use_na_sentinel : bool, default True If True, the sentinel -1 will be used for NaN values. If False, NaN values will be encoded as non-negative integers and will not drop the @@ -1074,11 +1043,10 @@ def factorize( # original ExtensionArray. # 2. ExtensionArray.factorize. # Complete control over factorization. 
- resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel) arr, na_value = self._values_for_factorize() codes, uniques = factorize_array( - arr, na_sentinel=resolved_na_sentinel, na_value=na_value + arr, use_na_sentinel=use_na_sentinel, na_value=na_value ) uniques_ea = self._from_factorized(uniques, self) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e82045eee6143..f98fbfe429871 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2144,10 +2144,9 @@ def _with_freq(self, freq): # -------------------------------------------------------------- - # GH#46910 - Keep old signature to test we don't break things for EA library authors - def factorize( # type:ignore[override] + def factorize( self, - na_sentinel: int = -1, + use_na_sentinel: bool = True, sort: bool = False, ): if self.freq is not None: @@ -2159,7 +2158,7 @@ def factorize( # type:ignore[override] uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort - return super().factorize(na_sentinel=na_sentinel) + return super().factorize(use_na_sentinel=use_na_sentinel) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e74bd2a25bc5e..9968ebc826575 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -901,28 +901,25 @@ def searchsorted( @doc(ExtensionArray.factorize) def factorize( self, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ) -> tuple[np.ndarray, ExtensionArray]: - resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel) arr = self._data mask = self._mask - # Pass non-None na_sentinel; recode and add NA to uniques if necessary below - na_sentinel_arg = -1 if resolved_na_sentinel is None else resolved_na_sentinel - codes, uniques = 
factorize_array(arr, na_sentinel=na_sentinel_arg, mask=mask) + # Use a sentinel for na; recode and add NA to uniques if necessary below + codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask) # check that factorize_array correctly preserves dtype. assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) has_na = mask.any() - if resolved_na_sentinel is not None or not has_na: + if use_na_sentinel or not has_na: size = len(uniques) else: # Make room for an NA value size = len(uniques) + 1 uniques_mask = np.zeros(size, dtype=bool) - if resolved_na_sentinel is None and has_na: + if not use_na_sentinel and has_na: na_index = mask.argmax() # Insert na with the proper code if na_index == 0: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 93d6ac0ef6e06..d10b3a216c215 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -873,8 +873,7 @@ def _values_for_factorize(self): def factorize( self, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ) -> tuple[np.ndarray, SparseArray]: # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want @@ -882,12 +881,8 @@ def factorize( # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? 
codes, uniques = algos.factorize( - np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel + np.asarray(self), use_na_sentinel=use_na_sentinel ) - if na_sentinel is lib.no_default: - na_sentinel = -1 - if use_na_sentinel is lib.no_default or use_na_sentinel: - codes[codes == -1] = na_sentinel uniques_sp = SparseArray(uniques, dtype=self.dtype) return codes, uniques_sp diff --git a/pandas/core/base.py b/pandas/core/base.py index afcab23e130cd..46803e1f28975 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1138,12 +1138,9 @@ def _memory_usage(self, deep: bool = False) -> int: def factorize( self, sort: bool = False, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ): - return algorithms.factorize( - self, sort=sort, na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel - ) + return algorithms.factorize(self, sort=sort, use_na_sentinel=use_na_sentinel) _shared_docs[ "searchsorted" diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f15c244d8b628..ae88b85aa06e1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -43,7 +43,6 @@ from pandas.core.dtypes.generic import ABCTimedeltaIndex from pandas.core import ops -from pandas.core.algorithms import resolve_na_sentinel import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -457,11 +456,8 @@ def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]: def factorize( self, sort: bool = False, - na_sentinel: int | lib.NoDefault = lib.no_default, - use_na_sentinel: bool | lib.NoDefault = lib.no_default, + use_na_sentinel: bool = True, ) -> tuple[npt.NDArray[np.intp], RangeIndex]: - # resolve to emit warning if appropriate - resolve_na_sentinel(na_sentinel, use_na_sentinel) codes = np.arange(len(self), dtype=np.intp) uniques = self if sort and self.step < 0: diff --git 
a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f46cf6085b06d..74a1051825820 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2404,7 +2404,7 @@ def _sort_labels( llength = len(left) labels = np.concatenate([left, right]) - _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 838c9f5b8a35f..2df410dff2b00 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -211,33 +211,17 @@ def test_unique(self, data, box, method): assert isinstance(result, type(data)) assert result[0] == duplicated[0] - @pytest.mark.parametrize("na_sentinel", [-1, -2]) - def test_factorize(self, data_for_grouping, na_sentinel): - if na_sentinel == -1: - msg = "Specifying `na_sentinel=-1` is deprecated" - else: - msg = "Specifying the specific value to use for `na_sentinel` is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - expected_codes = np.array( - [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp - ) + def test_factorize(self, data_for_grouping): + codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True) + expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp) expected_uniques = data_for_grouping.take([0, 4, 7]) tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize("na_sentinel", [-1, -2]) - def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - if na_sentinel == -1: - msg = "Specifying `na_sentinel=-1` is deprecated" - else: - msg = "Specifying the specific value to use for 
`na_sentinel` is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - codes_1, uniques_1 = pd.factorize( - data_for_grouping, na_sentinel=na_sentinel - ) - codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + def test_factorize_equivalence(self, data_for_grouping): + codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True) + codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True) tm.assert_numpy_array_equal(codes_1, codes_2) self.assert_extension_array_equal(uniques_1, uniques_2) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f68e38be44811..d44944c74f9d5 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -871,8 +871,7 @@ def test_unique(self, data, box, method, request): ) super().test_unique(data, box, method) - @pytest.mark.parametrize("na_sentinel", [-1, -2]) - def test_factorize(self, data_for_grouping, na_sentinel, request): + def test_factorize(self, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype if pa.types.is_duration(pa_dtype): request.node.add_marker( @@ -887,10 +886,9 @@ def test_factorize(self, data_for_grouping, na_sentinel, request): reason=f"{pa_dtype} only has 2 unique possible values", ) ) - super().test_factorize(data_for_grouping, na_sentinel) + super().test_factorize(data_for_grouping) - @pytest.mark.parametrize("na_sentinel", [-1, -2]) - def test_factorize_equivalence(self, data_for_grouping, na_sentinel, request): + def test_factorize_equivalence(self, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype if pa.types.is_duration(pa_dtype): request.node.add_marker( @@ -899,7 +897,7 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel, request): reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", ) ) - super().test_factorize_equivalence(data_for_grouping, na_sentinel) + 
super().test_factorize_equivalence(data_for_grouping) def test_factorize_empty(self, data, request): pa_dtype = data.dtype.pyarrow_dtype diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index dd067102aba6c..b846028dab947 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -174,18 +174,10 @@ class TestReshaping(base.BaseReshapingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.parametrize("na_sentinel", [-1, -2]) - def test_factorize(self, data_for_grouping, na_sentinel): + def test_factorize(self, data_for_grouping): # override because we only have 2 unique values - if na_sentinel == -1: - msg = "Specifying `na_sentinel=-1` is deprecated" - else: - msg = "Specifying the specific value to use for `na_sentinel` is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - expected_labels = np.array( - [0, 0, na_sentinel, na_sentinel, 1, 1, 0], dtype=np.intp - ) + labels, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True) + expected_labels = np.array([0, 0, -1, -1, 1, 1, 0], dtype=np.intp) expected_uniques = data_for_grouping.take([0, 4]) tm.assert_numpy_array_equal(labels, expected_labels) diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py index a4b1a4b43ef2b..1ed626cd51080 100644 --- a/pandas/tests/extension/test_extension.py +++ b/pandas/tests/extension/test_extension.py @@ -4,7 +4,6 @@ import numpy as np import pytest -import pandas._testing as tm from pandas.core.arrays import ExtensionArray @@ -25,16 +24,3 @@ def test_errors(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators with pytest.raises(AttributeError): getattr(data, op_name) - - -def test_depr_na_sentinel(): - # GH#46910 - msg = "The `na_sentinel` argument of `MyEA.factorize` is deprecated" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - - class MyEA(ExtensionArray): - def factorize(self, na_sentinel=-1): - pass - - with tm.assert_produces_warning(None): - MyEA() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c6aefd5bb73b9..d2de6cb7f21a3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -102,20 +102,6 @@ def test_series_factorize_use_na_sentinel_false(self): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_index_equal(uniques, expected_uniques) - @pytest.mark.parametrize("na_sentinel", [None, -1, -10]) - def test_depr_na_sentinel(self, na_sentinel, index_or_series_obj): - # GH#46910 - if na_sentinel is None: - msg = "Specifying `na_sentinel=None` is deprecated" - elif na_sentinel == -1: - msg = "Specifying `na_sentinel=-1` is deprecated" - else: - msg = "Specifying the specific value to use for `na_sentinel` is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - pd.factorize(index_or_series_obj, na_sentinel=na_sentinel) - with tm.assert_produces_warning(FutureWarning, match=msg): - index_or_series_obj.factorize(na_sentinel=na_sentinel) - def test_basic(self): codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) @@ -421,7 +407,6 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize("sort", [True, False]) - @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) @pytest.mark.parametrize( "data, uniques", [ @@ -436,18 +421,13 @@ def test_parametrized_factorize_na_value(self, data, na_value): ], ids=["numpy_array", "extension_array"], ) - def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - if na_sentinel == -1: - msg = "Specifying `na_sentinel=-1` is deprecated" - else: - msg = "the specific value to use for `na_sentinel` is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - codes, uniques = 
algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + def test_factorize_use_na_sentinel(self, sort, data, uniques): + codes, uniques = algos.factorize(data, sort=sort, use_na_sentinel=True) if sort: - expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([1, 0, -1, 1], dtype=np.intp) expected_uniques = algos.safe_sort(uniques) else: - expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_codes = np.array([0, 1, -1, 0], dtype=np.intp) expected_uniques = uniques tm.assert_numpy_array_equal(codes, expected_codes) if isinstance(data, np.ndarray): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index ba1943878cfad..44895cc576fd0 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -412,19 +412,18 @@ def test_basic_sort(self, arg, exp): @pytest.mark.parametrize("verify", [True, False]) @pytest.mark.parametrize( - "codes, exp_codes, na_sentinel", + "codes, exp_codes", [ - [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4], -1], - [[0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4], 99], - [[], [], -1], + [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4]], + [[], []], ], ) - def test_codes(self, verify, codes, exp_codes, na_sentinel): + def test_codes(self, verify, codes, exp_codes): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) result, result_codes = safe_sort( - values, codes, na_sentinel=na_sentinel, verify=verify + values, codes, use_na_sentinel=True, verify=verify ) expected_codes = np.array(exp_codes, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @@ -435,17 +434,14 @@ def test_codes(self, verify, codes, exp_codes, na_sentinel): reason="In CI environment can crash thread with: " "Windows fatal exception: access violation", ) - @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_codes_out_of_bound(self, na_sentinel): + def test_codes_out_of_bound(self): values = [3, 1, 2, 0, 4] expected = np.array([0, 
1, 2, 3, 4]) # out of bound indices codes = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel) - expected_codes = np.array( - [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp - ) + result, result_codes = safe_sort(values, codes, use_na_sentinel=True) + expected_codes = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_codes, expected_codes) @@ -494,14 +490,11 @@ def test_extension_array(self, arg, exp): tm.assert_extension_array_equal(result, expected) @pytest.mark.parametrize("verify", [True, False]) - @pytest.mark.parametrize("na_sentinel", [-1, 99]) - def test_extension_array_codes(self, verify, na_sentinel): + def test_extension_array_codes(self, verify): a = array([1, 3, 2], dtype="Int64") - result, codes = safe_sort( - a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify - ) + result, codes = safe_sort(a, [0, 1, -1, 2], use_na_sentinel=True, verify=verify) expected_values = array([1, 2, 3], dtype="Int64") - expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([0, 2, -1, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(codes, expected_codes)