Skip to content

Commit 9bc223e

Browse files
Backport PR #35852: API: replace dropna=False option with na_sentinel=None in factorize (#36071)
Co-authored-by: Kaiqi Dong <[email protected]>
1 parent f9fed6a commit 9bc223e

File tree

6 files changed

+66
-41
lines changed

6 files changed

+66
-41
lines changed

doc/source/whatsnew/v1.1.2.rst

+8
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ Bug fixes
3434

3535
.. ---------------------------------------------------------------------------
3636
37+
.. _whatsnew_112.other:
38+
39+
Other
40+
~~~~~
41+
- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values. The ``dropna`` keyword, which was unintentionally exposed in the public-facing API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`)
42+
43+
.. ---------------------------------------------------------------------------
44+
3745
.. _whatsnew_112.contributors:
3846

3947
Contributors

pandas/core/algorithms.py

+29-4
Original file line numberDiff line numberDiff line change
@@ -525,9 +525,8 @@ def _factorize_array(
525525
def factorize(
526526
values,
527527
sort: bool = False,
528-
na_sentinel: int = -1,
528+
na_sentinel: Optional[int] = -1,
529529
size_hint: Optional[int] = None,
530-
dropna: bool = True,
531530
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
532531
"""
533532
Encode the object as an enumerated type or categorical variable.
@@ -540,8 +539,11 @@ def factorize(
540539
Parameters
541540
----------
542541
{values}{sort}
543-
na_sentinel : int, default -1
544-
Value to mark "not found".
542+
na_sentinel : int or None, default -1
543+
Value to mark "not found". If None, NaN is not dropped
544+
from the uniques of the values.
545+
546+
.. versionchanged:: 1.1.2
545547
{size_hint}\
546548
547549
Returns
@@ -619,6 +621,22 @@ def factorize(
619621
array([0, 0, 1]...)
620622
>>> uniques
621623
Index(['a', 'c'], dtype='object')
624+
625+
If NaN is in the values, and we want to include NaN in the uniques of the
626+
values, it can be achieved by setting ``na_sentinel=None``.
627+
628+
>>> values = np.array([1, 2, 1, np.nan])
629+
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1
630+
>>> codes
631+
array([ 0, 1, 0, -1])
632+
>>> uniques
633+
array([1., 2.])
634+
635+
>>> codes, uniques = pd.factorize(values, na_sentinel=None)
636+
>>> codes
637+
array([0, 1, 0, 2])
638+
>>> uniques
639+
array([ 1., 2., nan])
622640
"""
623641
# Implementation notes: This method is responsible for 3 things
624642
# 1.) coercing data to array-like (ndarray, Index, extension array)
@@ -632,6 +650,13 @@ def factorize(
632650
values = _ensure_arraylike(values)
633651
original = values
634652

653+
# GH35667, if na_sentinel=None, we will not drop NaNs from the uniques
654+
# of values, and assign na_sentinel=-1 to replace the code value for NaN.
655+
dropna = True
656+
if na_sentinel is None:
657+
na_sentinel = -1
658+
dropna = False
659+
635660
if is_extension_array_dtype(values.dtype):
636661
values = extract_array(values)
637662
codes, uniques = values.factorize(na_sentinel=na_sentinel)

pandas/core/base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):
13981398
"""
13991399
),
14001400
)
1401-
def factorize(self, sort=False, na_sentinel=-1):
1401+
def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
14021402
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
14031403

14041404
_shared_docs[

pandas/core/groupby/grouper.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -585,8 +585,13 @@ def _make_codes(self) -> None:
585585
codes = self.grouper.codes_info
586586
uniques = self.grouper.result_index
587587
else:
588+
# GH35667, replace dropna=False with na_sentinel=None
589+
if not self.dropna:
590+
na_sentinel = None
591+
else:
592+
na_sentinel = -1
588593
codes, uniques = algorithms.factorize(
589-
self.grouper, sort=self.sort, dropna=self.dropna
594+
self.grouper, sort=self.sort, na_sentinel=na_sentinel
590595
)
591596
uniques = Index(uniques, name=self.name)
592597
self._codes = codes

pandas/tests/base/test_factorize.py

+13
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):
2626

2727
tm.assert_numpy_array_equal(result_codes, expected_codes)
2828
tm.assert_index_equal(result_uniques, expected_uniques)
29+
30+
31+
def test_series_factorize_na_sentinel_none():
32+
# GH35667
33+
values = np.array([1, 2, 1, np.nan])
34+
ser = pd.Series(values)
35+
codes, uniques = ser.factorize(na_sentinel=None)
36+
37+
expected_codes = np.array([0, 1, 0, 2], dtype="int64")
38+
expected_uniques = pd.Index([1.0, 2.0, np.nan])
39+
40+
tm.assert_numpy_array_equal(codes, expected_codes)
41+
tm.assert_index_equal(uniques, expected_uniques)

pandas/tests/test_algos.py

+9-35
Original file line numberDiff line numberDiff line change
@@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
326326
tm.assert_extension_array_equal(uniques, expected_uniques)
327327

328328
@pytest.mark.parametrize(
329-
"data, dropna, expected_codes, expected_uniques",
329+
"data, expected_codes, expected_uniques",
330330
[
331331
(
332332
["a", None, "b", "a"],
333-
True,
334-
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
335-
np.array(["a", "b"], dtype=object),
336-
),
337-
(
338-
["a", np.nan, "b", "a"],
339-
True,
340-
np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
341-
np.array(["a", "b"], dtype=object),
342-
),
343-
(
344-
["a", None, "b", "a"],
345-
False,
346333
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
347334
np.array(["a", "b", np.nan], dtype=object),
348335
),
349336
(
350337
["a", np.nan, "b", "a"],
351-
False,
352338
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
353339
np.array(["a", "b", np.nan], dtype=object),
354340
),
355341
],
356342
)
357-
def test_object_factorize_dropna(
358-
self, data, dropna, expected_codes, expected_uniques
343+
def test_object_factorize_na_sentinel_none(
344+
self, data, expected_codes, expected_uniques
359345
):
360-
codes, uniques = algos.factorize(data, dropna=dropna)
346+
codes, uniques = algos.factorize(data, na_sentinel=None)
361347

362348
tm.assert_numpy_array_equal(uniques, expected_uniques)
363349
tm.assert_numpy_array_equal(codes, expected_codes)
364350

365351
@pytest.mark.parametrize(
366-
"data, dropna, expected_codes, expected_uniques",
352+
"data, expected_codes, expected_uniques",
367353
[
368354
(
369355
[1, None, 1, 2],
370-
True,
371-
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
372-
np.array([1, 2], dtype="O"),
373-
),
374-
(
375-
[1, np.nan, 1, 2],
376-
True,
377-
np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
378-
np.array([1, 2], dtype=np.float64),
379-
),
380-
(
381-
[1, None, 1, 2],
382-
False,
383356
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
384357
np.array([1, 2, np.nan], dtype="O"),
385358
),
386359
(
387360
[1, np.nan, 1, 2],
388-
False,
389361
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
390362
np.array([1, 2, np.nan], dtype=np.float64),
391363
),
392364
],
393365
)
394-
def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
395-
codes, uniques = algos.factorize(data, dropna=dropna)
366+
def test_int_factorize_na_sentinel_none(
367+
self, data, expected_codes, expected_uniques
368+
):
369+
codes, uniques = algos.factorize(data, na_sentinel=None)
396370

397371
tm.assert_numpy_array_equal(uniques, expected_uniques)
398372
tm.assert_numpy_array_equal(codes, expected_codes)

0 commit comments

Comments
 (0)