Skip to content

Commit 33aacd8

Browse files
rhshadrach, mroeschke
authored and committed
DEPR: Enforce deprecation of na_sentinel (pandas-dev#49402)
* DEPR: Enforce deprecation of na_sentinel
* Fixups
* Fix docstring
* Update doc/source/whatsnew/v2.0.0.rst
Co-authored-by: Matthew Roeschke <[email protected]>
* Remove comment
Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 6c58ab4 commit 33aacd8

File tree

16 files changed

+78
-277
lines changed

16 files changed

+78
-277
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -441,6 +441,7 @@ Removal of prior version deprecations/changes
441441
- Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`)
442442
- Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`)
443443
- Change behavior of :meth:`DataFrame.apply` with list-like so that any partial failure will raise an error (:issue:`43740`)
444+
- Removed ``na_sentinel`` argument from :func:`factorize`, :meth:`.Index.factorize`, and :meth:`.ExtensionArray.factorize` (:issue:`47157`)
444445
-
445446

446447
.. ---------------------------------------------------------------------------

pandas/core/algorithms.py

+29-107
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
"""
55
from __future__ import annotations
66

7-
import inspect
87
import operator
98
from textwrap import dedent
109
from typing import (
@@ -524,7 +523,7 @@ def f(c, v):
524523

525524
def factorize_array(
526525
values: np.ndarray,
527-
na_sentinel: int | None = -1,
526+
use_na_sentinel: bool = True,
528527
size_hint: int | None = None,
529528
na_value: object = None,
530529
mask: npt.NDArray[np.bool_] | None = None,
@@ -537,7 +536,10 @@ def factorize_array(
537536
Parameters
538537
----------
539538
values : ndarray
540-
na_sentinel : int, default -1
539+
use_na_sentinel : bool, default True
540+
If True, the sentinel -1 will be used for NaN values. If False,
541+
NaN values will be encoded as non-negative integers and will not drop the
542+
NaN from the uniques of the values.
541543
size_hint : int, optional
542544
Passed through to the hashtable's 'get_labels' method
543545
na_value : object, optional
@@ -555,10 +557,6 @@ def factorize_array(
555557
codes : ndarray[np.intp]
556558
uniques : ndarray
557559
"""
558-
ignore_na = na_sentinel is not None
559-
if not ignore_na:
560-
na_sentinel = -1
561-
562560
original = values
563561
if values.dtype.kind in ["m", "M"]:
564562
# _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
@@ -572,10 +570,10 @@ def factorize_array(
572570
table = hash_klass(size_hint or len(values))
573571
uniques, codes = table.factorize(
574572
values,
575-
na_sentinel=na_sentinel,
573+
na_sentinel=-1,
576574
na_value=na_value,
577575
mask=mask,
578-
ignore_na=ignore_na,
576+
ignore_na=use_na_sentinel,
579577
)
580578

581579
# re-cast e.g. i8->dt64/td64, uint8->bool
@@ -610,8 +608,7 @@ def factorize_array(
610608
def factorize(
611609
values,
612610
sort: bool = False,
613-
na_sentinel: int | None | lib.NoDefault = lib.no_default,
614-
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
611+
use_na_sentinel: bool = True,
615612
size_hint: int | None = None,
616613
) -> tuple[np.ndarray, np.ndarray | Index]:
617614
"""
@@ -625,17 +622,6 @@ def factorize(
625622
Parameters
626623
----------
627624
{values}{sort}
628-
na_sentinel : int or None, default -1
629-
Value to mark "not found". If None, will not drop the NaN
630-
from the uniques of the values.
631-
632-
.. deprecated:: 1.5.0
633-
The na_sentinel argument is deprecated and
634-
will be removed in a future version of pandas. Specify use_na_sentinel as
635-
either True or False.
636-
637-
.. versionchanged:: 1.1.2
638-
639625
use_na_sentinel : bool, default True
640626
If True, the sentinel -1 will be used for NaN values. If False,
641627
NaN values will be encoded as non-negative integers and will not drop the
@@ -748,12 +734,6 @@ def factorize(
748734
# Step 2 is dispatched to extension types (like Categorical). They are
749735
# responsible only for factorization. All data coercion, sorting and boxing
750736
# should happen here.
751-
752-
# GH#46910 deprecated na_sentinel in favor of use_na_sentinel:
753-
# na_sentinel=None corresponds to use_na_sentinel=False
754-
# na_sentinel=-1 correspond to use_na_sentinel=True
755-
# Other na_sentinel values will not be supported when the deprecation is enforced.
756-
na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
757737
if isinstance(values, ABCRangeIndex):
758738
return values.factorize(sort=sort)
759739

@@ -772,25 +752,12 @@ def factorize(
772752
return _re_wrap_factorize(original, uniques, codes)
773753

774754
elif not isinstance(values.dtype, np.dtype):
775-
if (
776-
na_sentinel == -1 or na_sentinel is None
777-
) and "use_na_sentinel" in inspect.signature(values.factorize).parameters:
778-
# Avoid using catch_warnings when possible
779-
# GH#46910 - TimelikeOps has deprecated signature
780-
codes, uniques = values.factorize( # type: ignore[call-arg]
781-
use_na_sentinel=na_sentinel is not None
782-
)
783-
else:
784-
na_sentinel_arg = -1 if na_sentinel is None else na_sentinel
785-
with warnings.catch_warnings():
786-
# We've already warned above
787-
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
788-
codes, uniques = values.factorize(na_sentinel=na_sentinel_arg)
755+
codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
789756

790757
else:
791758
values = np.asarray(values) # convert DTA/TDA/MultiIndex
792759

793-
if na_sentinel is None and is_object_dtype(values):
760+
if not use_na_sentinel and is_object_dtype(values):
794761
# factorize can now handle differentiating various types of null values.
795762
# These can only occur when the array has object dtype.
796763
# However, for backwards compatibility we only use the null for the
@@ -803,70 +770,24 @@ def factorize(
803770

804771
codes, uniques = factorize_array(
805772
values,
806-
na_sentinel=na_sentinel,
773+
use_na_sentinel=use_na_sentinel,
807774
size_hint=size_hint,
808775
)
809776

810777
if sort and len(uniques) > 0:
811778
uniques, codes = safe_sort(
812-
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
779+
uniques,
780+
codes,
781+
use_na_sentinel=use_na_sentinel,
782+
assume_unique=True,
783+
verify=False,
813784
)
814785

815786
uniques = _reconstruct_data(uniques, original.dtype, original)
816787

817788
return _re_wrap_factorize(original, uniques, codes)
818789

819790

820-
def resolve_na_sentinel(
821-
na_sentinel: int | None | lib.NoDefault,
822-
use_na_sentinel: bool | lib.NoDefault,
823-
) -> int | None:
824-
"""
825-
Determine value of na_sentinel for factorize methods.
826-
827-
See GH#46910 for details on the deprecation.
828-
829-
Parameters
830-
----------
831-
na_sentinel : int, None, or lib.no_default
832-
Value passed to the method.
833-
use_na_sentinel : bool or lib.no_default
834-
Value passed to the method.
835-
836-
Returns
837-
-------
838-
Resolved value of na_sentinel.
839-
"""
840-
if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
841-
raise ValueError(
842-
"Cannot specify both `na_sentinel` and `use_na_sentile`; "
843-
f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
844-
)
845-
if na_sentinel is lib.no_default:
846-
result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
847-
else:
848-
if na_sentinel is None:
849-
msg = (
850-
"Specifying `na_sentinel=None` is deprecated, specify "
851-
"`use_na_sentinel=False` instead."
852-
)
853-
elif na_sentinel == -1:
854-
msg = (
855-
"Specifying `na_sentinel=-1` is deprecated, specify "
856-
"`use_na_sentinel=True` instead."
857-
)
858-
else:
859-
msg = (
860-
"Specifying the specific value to use for `na_sentinel` is "
861-
"deprecated and will be removed in a future version of pandas. "
862-
"Specify `use_na_sentinel=True` to use the sentinel value -1, and "
863-
"`use_na_sentinel=False` to encode NaN values."
864-
)
865-
warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
866-
result = na_sentinel
867-
return result
868-
869-
870791
def _re_wrap_factorize(original, uniques, codes: np.ndarray):
871792
"""
872793
Wrap factorize results in Series or Index depending on original type.
@@ -1764,7 +1685,7 @@ def diff(arr, n: int, axis: AxisInt = 0):
17641685
def safe_sort(
17651686
values,
17661687
codes=None,
1767-
na_sentinel: int | None = -1,
1688+
use_na_sentinel: bool = True,
17681689
assume_unique: bool = False,
17691690
verify: bool = True,
17701691
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
@@ -1780,16 +1701,17 @@ def safe_sort(
17801701
Sequence; must be unique if ``codes`` is not None.
17811702
codes : list_like, optional
17821703
Indices to ``values``. All out of bound indices are treated as
1783-
"not found" and will be masked with ``na_sentinel``.
1784-
na_sentinel : int or None, default -1
1785-
Value in ``codes`` to mark "not found", or None to encode null values as normal.
1786-
Ignored when ``codes`` is None.
1704+
"not found" and will be masked with ``-1``.
1705+
use_na_sentinel : bool, default True
1706+
If True, the sentinel -1 will be used for NaN values. If False,
1707+
NaN values will be encoded as non-negative integers and will not drop the
1708+
NaN from the uniques of the values.
17871709
assume_unique : bool, default False
17881710
When True, ``values`` are assumed to be unique, which can speed up
17891711
the calculation. Ignored when ``codes`` is None.
17901712
verify : bool, default True
17911713
Check if codes are out of bound for the values and put out of bound
1792-
codes equal to na_sentinel. If ``verify=False``, it is assumed there
1714+
codes equal to ``-1``. If ``verify=False``, it is assumed there
17931715
are no out of bound codes. Ignored when ``codes`` is None.
17941716
17951717
.. versionadded:: 0.25.0
@@ -1867,7 +1789,7 @@ def safe_sort(
18671789
t.map_locations(values)
18681790
sorter = ensure_platform_int(t.lookup(ordered))
18691791

1870-
if na_sentinel == -1:
1792+
if use_na_sentinel:
18711793
# take_nd is faster, but only works for na_sentinels of -1
18721794
order2 = sorter.argsort()
18731795
new_codes = take_nd(order2, codes, fill_value=-1)
@@ -1878,17 +1800,17 @@ def safe_sort(
18781800
else:
18791801
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
18801802
reverse_indexer.put(sorter, np.arange(len(sorter)))
1881-
# Out of bound indices will be masked with `na_sentinel` next, so we
1803+
# Out of bound indices will be masked with `-1` next, so we
18821804
# may deal with them here without performance loss using `mode='wrap'`
18831805
new_codes = reverse_indexer.take(codes, mode="wrap")
18841806

1885-
if na_sentinel is not None:
1886-
mask = codes == na_sentinel
1807+
if use_na_sentinel:
1808+
mask = codes == -1
18871809
if verify:
18881810
mask = mask | (codes < -len(values)) | (codes >= len(values))
18891811

1890-
if na_sentinel is not None and mask is not None:
1891-
np.putmask(new_codes, mask, na_sentinel)
1812+
if use_na_sentinel and mask is not None:
1813+
np.putmask(new_codes, mask, -1)
18921814

18931815
return ordered, ensure_platform_int(new_codes)
18941816

pandas/core/arrays/arrow/array.py

+3-10
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88

99
import numpy as np
1010

11-
from pandas._libs import lib
1211
from pandas._typing import (
1312
Dtype,
1413
PositionalIndexer,
@@ -31,7 +30,6 @@
3130
)
3231
from pandas.core.dtypes.missing import isna
3332

34-
from pandas.core.algorithms import resolve_na_sentinel
3533
from pandas.core.arraylike import OpsMixin
3634
from pandas.core.arrays.base import ExtensionArray
3735
from pandas.core.indexers import (
@@ -553,22 +551,17 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
553551
@doc(ExtensionArray.factorize)
554552
def factorize(
555553
self,
556-
na_sentinel: int | lib.NoDefault = lib.no_default,
557-
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
554+
use_na_sentinel: bool = True,
558555
) -> tuple[np.ndarray, ExtensionArray]:
559-
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
560-
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
556+
null_encoding = "mask" if use_na_sentinel else "encode"
561557
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
562558
if encoded.length() == 0:
563559
indices = np.array([], dtype=np.intp)
564560
uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
565561
else:
566562
pa_indices = encoded.combine_chunks().indices
567563
if pa_indices.null_count > 0:
568-
fill_value = (
569-
resolved_na_sentinel if resolved_na_sentinel is not None else -1
570-
)
571-
pa_indices = pc.fill_null(pa_indices, fill_value)
564+
pa_indices = pc.fill_null(pa_indices, -1)
572565
indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
573566
np.intp, copy=False
574567
)

0 commit comments

Comments
 (0)