Skip to content

Commit d580826

Browse files
rhshadrach and jreback authored
DEPR: na_sentinel in factorize (#47157)
* DEPR: na_sentinel in factorize * WIP * DEPR: na_sentinel in factorize * Fixups * Fixups * black * fixup * docs * newline * Warn on class construction, rework pd.factorize warnings * FutureWarning -> DeprecationWarning * Remove old comment * backticks in warnings, revert datetimelike, avoid catch_warnings * fixup for warnings * mypy fixups * Move resolve_na_sentinel * Remove underscores Co-authored-by: Jeff Reback <[email protected]>
1 parent 6786ab2 commit d580826

File tree

15 files changed

+252
-41
lines changed

15 files changed

+252
-41
lines changed

doc/source/whatsnew/v1.5.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -725,8 +725,9 @@ Other Deprecations
725725
- Deprecated the ``closed`` argument in :class:`ArrowInterval` in favor of ``inclusive`` argument; In a future version passing ``closed`` will raise (:issue:`40245`)
726726
- Deprecated allowing ``unit="M"`` or ``unit="Y"`` in :class:`Timestamp` constructor with a non-round float value (:issue:`47267`)
727727
- Deprecated the ``display.column_space`` global configuration option (:issue:`7576`)
728+
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values, and pass ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
728729
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
729-
-
730+
730731

731732
.. ---------------------------------------------------------------------------
732733
.. _whatsnew_150.performance:

pandas/core/algorithms.py

+90-12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
from __future__ import annotations
66

7+
import inspect
78
import operator
89
from textwrap import dedent
910
from typing import (
@@ -14,7 +15,7 @@
1415
cast,
1516
final,
1617
)
17-
from warnings import warn
18+
import warnings
1819

1920
import numpy as np
2021

@@ -586,7 +587,8 @@ def factorize_array(
586587
def factorize(
587588
values,
588589
sort: bool = False,
589-
na_sentinel: int | None = -1,
590+
na_sentinel: int | None | lib.NoDefault = lib.no_default,
591+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
590592
size_hint: int | None = None,
591593
) -> tuple[np.ndarray, np.ndarray | Index]:
592594
"""
@@ -604,7 +606,19 @@ def factorize(
604606
Value to mark "not found". If None, will not drop the NaN
605607
from the uniques of the values.
606608
609+
.. deprecated:: 1.5.0
610+
The na_sentinel argument is deprecated and
611+
will be removed in a future version of pandas. Specify use_na_sentinel as
612+
either True or False.
613+
607614
.. versionchanged:: 1.1.2
615+
616+
use_na_sentinel : bool, default True
617+
If True, the sentinel -1 will be used for NaN values. If False,
618+
NaN values will be encoded as non-negative integers and will not drop the
619+
NaN from the uniques of the values.
620+
621+
.. versionadded:: 1.5.0
608622
{size_hint}\
609623
610624
Returns
@@ -652,8 +666,8 @@ def factorize(
652666
>>> uniques
653667
array(['a', 'b', 'c'], dtype=object)
654668
655-
Missing values are indicated in `codes` with `na_sentinel`
656-
(``-1`` by default). Note that missing values are never
669+
When ``use_na_sentinel=True`` (the default), missing values are indicated in
670+
the `codes` with the sentinel value ``-1`` and missing values are not
657671
included in `uniques`.
658672
659673
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
@@ -688,16 +702,16 @@ def factorize(
688702
Index(['a', 'c'], dtype='object')
689703
690704
If NaN is in the values, and we want to include NaN in the uniques of the
691-
values, it can be achieved by setting ``na_sentinel=None``.
705+
values, it can be achieved by setting ``use_na_sentinel=False``.
692706
693707
>>> values = np.array([1, 2, 1, np.nan])
694-
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
708+
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
695709
>>> codes
696710
array([ 0, 1, 0, -1])
697711
>>> uniques
698712
array([1., 2.])
699713
700-
>>> codes, uniques = pd.factorize(values, na_sentinel=None)
714+
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
701715
>>> codes
702716
array([0, 1, 0, 2])
703717
>>> uniques
@@ -712,6 +726,7 @@ def factorize(
712726
# responsible only for factorization. All data coercion, sorting and boxing
713727
# should happen here.
714728

729+
na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
715730
if isinstance(values, ABCRangeIndex):
716731
return values.factorize(sort=sort)
717732

@@ -736,9 +751,22 @@ def factorize(
736751
codes, uniques = values.factorize(sort=sort)
737752
return _re_wrap_factorize(original, uniques, codes)
738753

739-
if not isinstance(values.dtype, np.dtype):
740-
# i.e. ExtensionDtype
741-
codes, uniques = values.factorize(na_sentinel=na_sentinel)
754+
elif not isinstance(values.dtype, np.dtype):
755+
if (
756+
na_sentinel == -1
757+
and "use_na_sentinel" in inspect.signature(values.factorize).parameters
758+
):
759+
# Avoid using catch_warnings when possible
760+
# GH#46910 - TimelikeOps has deprecated signature
761+
codes, uniques = values.factorize( # type: ignore[call-arg]
762+
use_na_sentinel=True
763+
)
764+
else:
765+
with warnings.catch_warnings():
766+
# We've already warned above
767+
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning)
768+
codes, uniques = values.factorize(na_sentinel=na_sentinel)
769+
742770
else:
743771
values = np.asarray(values) # convert DTA/TDA/MultiIndex
744772
codes, uniques = factorize_array(
@@ -763,6 +791,56 @@ def factorize(
763791
return _re_wrap_factorize(original, uniques, codes)
764792

765793

794+
def resolve_na_sentinel(
    na_sentinel: int | None | lib.NoDefault,
    use_na_sentinel: bool | lib.NoDefault,
) -> int | None:
    """
    Determine value of na_sentinel for factorize methods.

    See GH#46910 for details on the deprecation.

    Parameters
    ----------
    na_sentinel : int, None, or lib.no_default
        Value passed to the method.
    use_na_sentinel : bool or lib.no_default
        Value passed to the method.

    Returns
    -------
    int or None
        Resolved value of na_sentinel. When ``na_sentinel`` was not passed,
        this is ``-1`` if ``use_na_sentinel`` is unspecified or truthy and
        ``None`` otherwise. When ``na_sentinel`` was passed, it is returned
        unchanged.

    Raises
    ------
    ValueError
        If both ``na_sentinel`` and ``use_na_sentinel`` are specified.

    Warns
    -----
    FutureWarning
        If ``na_sentinel`` is specified, since the argument is deprecated.
    """
    if na_sentinel is not lib.no_default and use_na_sentinel is not lib.no_default:
        raise ValueError(
            "Cannot specify both `na_sentinel` and `use_na_sentinel`; "
            f"got `na_sentinel={na_sentinel}` and `use_na_sentinel={use_na_sentinel}`"
        )
    if na_sentinel is lib.no_default:
        # Only the new-style argument (or nothing) was given: no warning needed.
        result = -1 if use_na_sentinel is lib.no_default or use_na_sentinel else None
    else:
        # The deprecated argument was given explicitly; tailor the message to
        # the replacement the caller should use.
        if na_sentinel is None:
            msg = (
                "Specifying `na_sentinel=None` is deprecated, specify "
                "`use_na_sentinel=False` instead."
            )
        elif na_sentinel == -1:
            msg = (
                "Specifying `na_sentinel=-1` is deprecated, specify "
                "`use_na_sentinel=True` instead."
            )
        else:
            msg = (
                "Specifying the specific value to use for `na_sentinel` is "
                "deprecated and will be removed in a future version of pandas. "
                "Specify `use_na_sentinel=True` to use the sentinel value -1, and "
                "`use_na_sentinel=False` to encode NaN values."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
        result = na_sentinel
    return result
843+
766844
def _re_wrap_factorize(original, uniques, codes: np.ndarray):
767845
"""
768846
Wrap factorize results in Series or Index depending on original type.
@@ -956,7 +1034,7 @@ def mode(
9561034
try:
9571035
npresult = np.sort(npresult)
9581036
except TypeError as err:
959-
warn(f"Unable to sort modes: {err}")
1037+
warnings.warn(f"Unable to sort modes: {err}")
9601038

9611039
result = _reconstruct_data(npresult, original.dtype, original)
9621040
return result
@@ -1576,7 +1654,7 @@ def diff(arr, n: int, axis: int = 0):
15761654
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
15771655
return op(arr, arr.shift(n))
15781656
else:
1579-
warn(
1657+
warnings.warn(
15801658
"dtype lost in 'diff()'. In the future this will raise a "
15811659
"TypeError. Convert to a suitable dtype prior to calling 'diff'.",
15821660
FutureWarning,

pandas/core/arrays/arrow/array.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import numpy as np
1010

11+
from pandas._libs import lib
1112
from pandas._typing import (
1213
Dtype,
1314
PositionalIndexer,
@@ -31,6 +32,7 @@
3132
)
3233
from pandas.core.dtypes.missing import isna
3334

35+
from pandas.core.algorithms import resolve_na_sentinel
3436
from pandas.core.arraylike import OpsMixin
3537
from pandas.core.arrays.base import ExtensionArray
3638
from pandas.core.indexers import (
@@ -286,7 +288,16 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
286288
return type(self)(pc.drop_null(self._data))
287289

288290
@doc(ExtensionArray.factorize)
289-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
291+
def factorize(
292+
self,
293+
na_sentinel: int | lib.NoDefault = lib.no_default,
294+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
295+
) -> tuple[np.ndarray, ExtensionArray]:
296+
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
297+
if resolved_na_sentinel is None:
298+
raise NotImplementedError("Encoding NaN values is not yet implemented")
299+
else:
300+
na_sentinel = resolved_na_sentinel
290301
encoded = self._data.dictionary_encode()
291302
indices = pa.chunked_array(
292303
[c.indices for c in encoded.chunks], type=encoded.type.index_type

pandas/core/arrays/base.py

+44-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"""
99
from __future__ import annotations
1010

11+
import inspect
1112
import operator
1213
from typing import (
1314
TYPE_CHECKING,
@@ -20,6 +21,7 @@
2021
cast,
2122
overload,
2223
)
24+
import warnings
2325

2426
import numpy as np
2527

@@ -45,6 +47,7 @@
4547
cache_readonly,
4648
deprecate_nonkeyword_arguments,
4749
)
50+
from pandas.util._exceptions import find_stack_level
4851
from pandas.util._validators import (
4952
validate_bool_kwarg,
5053
validate_fillna_kwargs,
@@ -76,6 +79,7 @@
7679
isin,
7780
mode,
7881
rank,
82+
resolve_na_sentinel,
7983
unique,
8084
)
8185
from pandas.core.array_algos.quantile import quantile_with_mask
@@ -456,6 +460,24 @@ def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override]
456460
"""
457461
return ~(self == other)
458462

463+
def __init_subclass__(cls, **kwargs):
    """
    Warn when a new subclass has a ``factorize`` without ``use_na_sentinel``.

    Inspects the signature of ``cls.factorize`` and emits a
    ``DeprecationWarning`` when the ``use_na_sentinel`` parameter is absent,
    except for a fixed set of class names that are exempt from the check.
    See GH#46910 for details on the deprecation.
    """
    # TimelikeOps uses old factorize args to ensure we don't break things
    exempt_names = ("TimelikeOps", "DatetimeArray", "TimedeltaArray")
    name = cls.__name__

    factorize_params = inspect.signature(cls.factorize).parameters
    if "use_na_sentinel" in factorize_params or name in exempt_names:
        # Signature is already up to date, or the class is exempt.
        return

    message = (
        f"The `na_sentinel` argument of `{name}.factorize` is deprecated. "
        f"In the future, pandas will use the `use_na_sentinel` argument "
        f"instead. Add this argument to `{name}.factorize` to be compatible "
        f"with future versions of pandas and silence this warning."
    )
    warnings.warn(message, DeprecationWarning, stacklevel=find_stack_level())
480+
459481
def to_numpy(
460482
self,
461483
dtype: npt.DTypeLike | None = None,
@@ -1002,7 +1024,11 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
10021024
"""
10031025
return self.astype(object), np.nan
10041026

1005-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
1027+
def factorize(
1028+
self,
1029+
na_sentinel: int | lib.NoDefault = lib.no_default,
1030+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
1031+
) -> tuple[np.ndarray, ExtensionArray]:
10061032
"""
10071033
Encode the extension array as an enumerated type.
10081034
@@ -1011,6 +1037,18 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10111037
na_sentinel : int, default -1
10121038
Value to use in the `codes` array to indicate missing values.
10131039
1040+
.. deprecated:: 1.5.0
1041+
The na_sentinel argument is deprecated and
1042+
will be removed in a future version of pandas. Specify use_na_sentinel
1043+
as either True or False.
1044+
1045+
use_na_sentinel : bool, default True
1046+
If True, the sentinel -1 will be used for NaN values. If False,
1047+
NaN values will be encoded as non-negative integers and will not drop the
1048+
NaN from the uniques of the values.
1049+
1050+
.. versionadded:: 1.5.0
1051+
10141052
Returns
10151053
-------
10161054
codes : ndarray
@@ -1041,6 +1079,11 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10411079
# original ExtensionArray.
10421080
# 2. ExtensionArray.factorize.
10431081
# Complete control over factorization.
1082+
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
1083+
if resolved_na_sentinel is None:
1084+
raise NotImplementedError("Encoding NaN values is not yet implemented")
1085+
else:
1086+
na_sentinel = resolved_na_sentinel
10441087
arr, na_value = self._values_for_factorize()
10451088

10461089
codes, uniques = factorize_array(

pandas/core/arrays/datetimelike.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -1996,7 +1996,12 @@ def _with_freq(self, freq):
19961996

19971997
# --------------------------------------------------------------
19981998

1999-
def factorize(self, na_sentinel=-1, sort: bool = False):
1999+
# GH#46910 - Keep old signature to test we don't break things for EA library authors
2000+
def factorize( # type:ignore[override]
2001+
self,
2002+
na_sentinel: int = -1,
2003+
sort: bool = False,
2004+
):
20002005
if self.freq is not None:
20012006
# We must be unique, so can short-circuit (and retain freq)
20022007
codes = np.arange(len(self), dtype=np.intp)

pandas/core/arrays/masked.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,16 @@ def searchsorted(
869869
return self._data.searchsorted(value, side=side, sorter=sorter)
870870

871871
@doc(ExtensionArray.factorize)
872-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
872+
def factorize(
873+
self,
874+
na_sentinel: int | lib.NoDefault = lib.no_default,
875+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
876+
) -> tuple[np.ndarray, ExtensionArray]:
877+
resolved_na_sentinel = algos.resolve_na_sentinel(na_sentinel, use_na_sentinel)
878+
if resolved_na_sentinel is None:
879+
raise NotImplementedError("Encoding NaN values is not yet implemented")
880+
else:
881+
na_sentinel = resolved_na_sentinel
873882
arr = self._data
874883
mask = self._mask
875884

pandas/core/arrays/sparse/array.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -848,13 +848,19 @@ def _values_for_factorize(self):
848848
# Still override this for hash_pandas_object
849849
return np.asarray(self), self.fill_value
850850

851-
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, SparseArray]:
851+
def factorize(
852+
self,
853+
na_sentinel: int | lib.NoDefault = lib.no_default,
854+
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
855+
) -> tuple[np.ndarray, SparseArray]:
852856
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
853857
# The sparsity on this is backwards from what Sparse would want. Want
854858
# ExtensionArray.factorize -> Tuple[EA, EA]
855859
# Given that we have to return a dense array of codes, why bother
856860
# implementing an efficient factorize?
857-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
861+
codes, uniques = algos.factorize(
862+
np.asarray(self), na_sentinel=na_sentinel, use_na_sentinel=use_na_sentinel
863+
)
858864
uniques_sp = SparseArray(uniques, dtype=self.dtype)
859865
return codes, uniques_sp
860866

0 commit comments

Comments
 (0)