-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
DEPR: na_sentinel in factorize #47157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
552775f
79231e7
c822282
05fa0ca
f626dd8
a15e43a
9a33637
46e7a8d
b1edc89
c8d6fa2
0fd1ea7
465ab2b
6b4917c
d8e3d6b
0111eef
39b3747
5842053
945bb04
8f8cb18
9630edc
5524d53
fcd8a75
c615de4
80aaf5e
dc97a7b
eb6d5a1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
cast, | ||
final, | ||
) | ||
from warnings import warn | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
|
@@ -80,6 +80,7 @@ | |
na_value_for_dtype, | ||
) | ||
|
||
from pandas.core import common as com | ||
from pandas.core.array_algos.take import take_nd | ||
from pandas.core.construction import ( | ||
array as pd_array, | ||
|
@@ -580,7 +581,8 @@ def factorize_array( | |
def factorize( | ||
values, | ||
sort: bool = False, | ||
na_sentinel: int | None = -1, | ||
na_sentinel: int | None | lib.NoDefault = lib.no_default, | ||
use_na_sentinel: bool | lib.NoDefault = lib.no_default, | ||
size_hint: int | None = None, | ||
) -> tuple[np.ndarray, np.ndarray | Index]: | ||
""" | ||
|
@@ -598,7 +600,19 @@ def factorize( | |
Value to mark "not found". If None, will not drop the NaN | ||
from the uniques of the values. | ||
|
||
.. deprecated:: 1.5.0 | ||
The na_sentinel argument is deprecated and | ||
will be removed in a future version of pandas. Specify use_na_sentinel as | ||
either True or False. | ||
|
||
.. versionchanged:: 1.1.2 | ||
|
||
use_na_sentinel : bool, default True | ||
If True, the sentinel -1 will be used for NaN values. If False, | ||
NaN values will be encoded as non-negative integers and will not drop the | ||
NaN from the uniques of the values. | ||
|
||
.. versionadded:: 1.5.0 | ||
{size_hint}\ | ||
|
||
Returns | ||
|
@@ -646,8 +660,8 @@ def factorize( | |
>>> uniques | ||
array(['a', 'b', 'c'], dtype=object) | ||
|
||
Missing values are indicated in `codes` with `na_sentinel` | ||
(``-1`` by default). Note that missing values are never | ||
When ``use_na_sentinel=True`` (the default), missing values are indicated in | ||
the `codes` with the sentinel value ``-1`` and missing values are not | ||
included in `uniques`. | ||
|
||
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) | ||
|
@@ -682,16 +696,16 @@ def factorize( | |
Index(['a', 'c'], dtype='object') | ||
|
||
If NaN is in the values, and we want to include NaN in the uniques of the | ||
values, it can be achieved by setting ``na_sentinel=None``. | ||
values, it can be achieved by setting ``use_na_sentinel=False``. | ||
|
||
>>> values = np.array([1, 2, 1, np.nan]) | ||
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 | ||
>>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True | ||
>>> codes | ||
array([ 0, 1, 0, -1]) | ||
>>> uniques | ||
array([1., 2.]) | ||
|
||
>>> codes, uniques = pd.factorize(values, na_sentinel=None) | ||
>>> codes, uniques = pd.factorize(values, use_na_sentinel=False) | ||
>>> codes | ||
array([0, 1, 0, 2]) | ||
>>> uniques | ||
|
@@ -706,6 +720,7 @@ def factorize( | |
# responsible only for factorization. All data coercion, sorting and boxing | ||
# should happen here. | ||
|
||
na_sentinel = com.resolve_na_sentinel(na_sentinel, use_na_sentinel) | ||
if isinstance(values, ABCRangeIndex): | ||
return values.factorize(sort=sort) | ||
|
||
|
@@ -730,9 +745,12 @@ def factorize( | |
codes, uniques = values.factorize(sort=sort) | ||
return _re_wrap_factorize(original, uniques, codes) | ||
|
||
if not isinstance(values.dtype, np.dtype): | ||
# i.e. ExtensionDtype | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
elif not isinstance(values.dtype, np.dtype): | ||
with warnings.catch_warnings(): | ||
# We've already warned above | ||
warnings.filterwarnings("ignore", ".*use_na_sentinel.*", FutureWarning) | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we do a similar "if use_na_sentinel in signature" check (as in init_subclass) and in that case already pass the new keyword to the underlying EA? (in that case we don't have to catch any warning) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's unclear to me as we'd be trading out catch_warnings for inspect and more complex logic (though not by much). I thought inspect would take somewhat longer (on the order of 1ms) but on my machine it's pretty quick:
I've gone an implemented it; easy to revert if we don't like it. |
||
|
||
else: | ||
values = np.asarray(values) # convert DTA/TDA/MultiIndex | ||
codes, uniques = factorize_array( | ||
|
@@ -950,7 +968,7 @@ def mode( | |
try: | ||
npresult = np.sort(npresult) | ||
except TypeError as err: | ||
warn(f"Unable to sort modes: {err}") | ||
warnings.warn(f"Unable to sort modes: {err}") | ||
|
||
result = _reconstruct_data(npresult, original.dtype, original) | ||
return result | ||
|
@@ -1564,7 +1582,7 @@ def diff(arr, n: int, axis: int = 0): | |
raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") | ||
return op(arr, arr.shift(n)) | ||
else: | ||
warn( | ||
warnings.warn( | ||
"dtype lost in 'diff()'. In the future this will raise a " | ||
"TypeError. Convert to a suitable dtype prior to calling 'diff'.", | ||
FutureWarning, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I took this to mean use backticks for any argument / code in the warnings. I went and implemented that for all warnings.