Backport PR #35852: API: replace dropna=False option with na_sentinel=None in factorize (#36071)

meeseeksmachine · charlesdong1991 · web-flow · commit 9bc223e025d8 · 2020-09-02T18:37:25.000+02:00
Co-authored-by: Kaiqi Dong <kaiqi@kth.se>
diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst
@@ -34,6 +34,14 @@ Bug fixes
 
 .. ---------------------------------------------------------------------------
 
+.. _whatsnew_112.other:
+
+Other
+~~~~~
+- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values, and the ``dropna`` keyword, which was unintentionally exposed in the public API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`)
+
+.. ---------------------------------------------------------------------------
+
 .. _whatsnew_112.contributors:
 
 Contributors
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -525,9 +525,8 @@ def _factorize_array(
 def factorize(
     values,
     sort: bool = False,
-    na_sentinel: int = -1,
+    na_sentinel: Optional[int] = -1,
     size_hint: Optional[int] = None,
-    dropna: bool = True,
 ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
     """
     Encode the object as an enumerated type or categorical variable.
@@ -540,8 +539,11 @@ def factorize(
     Parameters
     ----------
     {values}{sort}
-    na_sentinel : int, default -1
-        Value to mark "not found".
+    na_sentinel : int or None, default -1
+        Value to mark "not found". If None, will not drop the NaN
+        from the uniques of the values.
+
+        .. versionchanged:: 1.1.2
     {size_hint}\
 
     Returns
@@ -619,6 +621,22 @@ def factorize(
     array([0, 0, 1]...)
     >>> uniques
     Index(['a', 'c'], dtype='object')
+
+    If NaN is in the values, and we want to include NaN in the uniques of the
+    values, it can be achieved by setting ``na_sentinel=None``.
+
+    >>> values = np.array([1, 2, 1, np.nan])
+    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
+    >>> codes
+    array([ 0,  1,  0, -1])
+    >>> uniques
+    array([1., 2.])
+
+    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
+    >>> codes
+    array([0, 1, 0, 2])
+    >>> uniques
+    array([ 1.,  2., nan])
     """
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)
@@ -632,6 +650,13 @@ def factorize(
     values = _ensure_arraylike(values)
     original = values
 
+    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
+    # of values, assign na_sentinel=-1 to replace code value for NaN.
+    dropna = True
+    if na_sentinel is None:
+        na_sentinel = -1
+        dropna = False
+
     if is_extension_array_dtype(values.dtype):
         values = extract_array(values)
         codes, uniques = values.factorize(na_sentinel=na_sentinel)
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):
             """
         ),
     )
-    def factorize(self, sort=False, na_sentinel=-1):
+    def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
         return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
 
     _shared_docs[
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -585,8 +585,13 @@ def _make_codes(self) -> None:
                 codes = self.grouper.codes_info
                 uniques = self.grouper.result_index
             else:
+                # GH35667, replace dropna=False with na_sentinel=None
+                if not self.dropna:
+                    na_sentinel = None
+                else:
+                    na_sentinel = -1
                 codes, uniques = algorithms.factorize(
-                    self.grouper, sort=self.sort, dropna=self.dropna
+                    self.grouper, sort=self.sort, na_sentinel=na_sentinel
                 )
                 uniques = Index(uniques, name=self.name)
             self._codes = codes
diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py
@@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):
 
     tm.assert_numpy_array_equal(result_codes, expected_codes)
     tm.assert_index_equal(result_uniques, expected_uniques)
+
+
+def test_series_factorize_na_sentinel_none():
+    # GH35667
+    values = np.array([1, 2, 1, np.nan])
+    ser = pd.Series(values)
+    codes, uniques = ser.factorize(na_sentinel=None)
+
+    expected_codes = np.array([0, 1, 0, 2], dtype="int64")
+    expected_uniques = pd.Index([1.0, 2.0, np.nan])
+
+    tm.assert_numpy_array_equal(codes, expected_codes)
+    tm.assert_index_equal(uniques, expected_uniques)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -326,73 +326,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
             tm.assert_extension_array_equal(uniques, expected_uniques)
 
     @pytest.mark.parametrize(
-        "data, dropna, expected_codes, expected_uniques",
+        "data, expected_codes, expected_uniques",
         [
             (
                 ["a", None, "b", "a"],
-                True,
-                np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b"], dtype=object),
-            ),
-            (
-                ["a", np.nan, "b", "a"],
-                True,
-                np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b"], dtype=object),
-            ),
-            (
-                ["a", None, "b", "a"],
-                False,
                 np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
                 np.array(["a", "b", np.nan], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
-                False,
                 np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
                 np.array(["a", "b", np.nan], dtype=object),
             ),
         ],
     )
-    def test_object_factorize_dropna(
-        self, data, dropna, expected_codes, expected_uniques
+    def test_object_factorize_na_sentinel_none(
+        self, data, expected_codes, expected_uniques
     ):
-        codes, uniques = algos.factorize(data, dropna=dropna)
+        codes, uniques = algos.factorize(data, na_sentinel=None)
 
         tm.assert_numpy_array_equal(uniques, expected_uniques)
         tm.assert_numpy_array_equal(codes, expected_codes)
 
     @pytest.mark.parametrize(
-        "data, dropna, expected_codes, expected_uniques",
+        "data, expected_codes, expected_uniques",
         [
             (
                 [1, None, 1, 2],
-                True,
-                np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2], dtype="O"),
-            ),
-            (
-                [1, np.nan, 1, 2],
-                True,
-                np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2], dtype=np.float64),
-            ),
-            (
-                [1, None, 1, 2],
-                False,
                 np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
                 np.array([1, 2, np.nan], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
-                False,
                 np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
                 np.array([1, 2, np.nan], dtype=np.float64),
             ),
         ],
     )
-    def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
-        codes, uniques = algos.factorize(data, dropna=dropna)
+    def test_int_factorize_na_sentinel_none(
+        self, data, expected_codes, expected_uniques
+    ):
+        codes, uniques = algos.factorize(data, na_sentinel=None)
 
         tm.assert_numpy_array_equal(uniques, expected_uniques)
         tm.assert_numpy_array_equal(codes, expected_codes)

Original file line number	Diff line number	Diff line change
`@@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):`
`1398`	`1398`	`"""`
`1399`	`1399`	`),`
`1400`	`1400`	`)`
`1401`		`- def factorize(self, sort=False, na_sentinel=-1):`
	`1401`	`+ def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):`
`1402`	`1402`	`return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)`
`1403`	`1403`
`1404`	`1404`	`_shared_docs[`