From 5dac4bda30ebca8cdf74bd5f28b5f004f755ad90 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 1 Apr 2022 10:14:36 -0400
Subject: [PATCH 1/4] CLN: Simplify factorize

---
 pandas/core/algorithms.py | 49 ++++++++-------------------------------
 1 file changed, 10 insertions(+), 39 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 6a04cbf4b5846..a818c3fabb9d7 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -772,10 +772,6 @@ def factorize(
     if not isinstance(values, ABCMultiIndex):
         values = extract_array(values, extract_numpy=True)
 
-    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
-    # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = na_sentinel is not None
-
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
         and values.freq is not None
@@ -803,43 +799,17 @@ def factorize(
 
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
-        # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
-        #       na_sentinel is None we append NA on the end because safe_sort does not
-        #       handle null values in uniques.
-        if na_sentinel is None and sort:
-            na_sentinel_arg = -1
-        elif na_sentinel is None:
-            na_sentinel_arg = None
-        else:
-            na_sentinel_arg = na_sentinel
         codes, uniques = factorize_array(
             values,
-            na_sentinel=na_sentinel_arg,
+            na_sentinel=na_sentinel,
             size_hint=size_hint,
         )
 
     if sort and len(uniques) > 0:
-        if na_sentinel is None:
-            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
-            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
-    if not dropna and sort:
-        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
-        if na_sentinel is None:
-            na_sentinel_arg = -1
-        else:
-            na_sentinel_arg = na_sentinel
-        code_is_na = codes == na_sentinel_arg
-        if code_is_na.any():
-            # na_value is set based on the dtype of uniques, and compat set to False is
-            # because we do not want na_value to be 0 for integers
-            na_value = na_value_for_dtype(uniques.dtype, compat=False)
-            uniques = np.append(uniques, [na_value])
-            codes = np.where(code_is_na, len(uniques) - 1, codes)
-
     uniques = _reconstruct_data(uniques, original.dtype, original)
 
     return _re_wrap_factorize(original, uniques, codes)
@@ -1918,24 +1888,25 @@ def safe_sort(
         # may deal with them here without performance loss using `mode='wrap'`
         new_codes = reverse_indexer.take(codes, mode="wrap")
 
-        mask = codes == na_sentinel
-        if verify:
-            mask = mask | (codes < -len(values)) | (codes >= len(values))
+        if na_sentinel is not None:
+            mask = codes == na_sentinel
+            if verify:
+                mask = mask | (codes < -len(values)) | (codes >= len(values))
 
-    if mask is not None:
+    if na_sentinel is not None and mask is not None:
         np.putmask(new_codes, mask, na_sentinel)
 
     return ordered, ensure_platform_int(new_codes)
 
 
 def _sort_mixed(values) -> np.ndarray:
-    """order ints before strings in 1d arrays, safe in py3"""
+    """order ints before strings before nulls in 1d arrays"""
     str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-    none_pos = np.array([x is None for x in values], dtype=bool)
-    nums = np.sort(values[~str_pos & ~none_pos])
+    null_pos = np.array([isna(x) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos & ~null_pos])
     strs = np.sort(values[str_pos])
     return np.concatenate(
-        [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
+        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
     )
 
 

From a11398906a569358c0b17eb5f79e8b9f7f7e746e Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Thu, 15 Sep 2022 22:47:18 -0400
Subject: [PATCH 2/4] TYP/DOC: safe_sort accepts na_sentinel=None

---
 pandas/core/algorithms.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index a818c3fabb9d7..b5c6b7a18eda3 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1764,7 +1764,7 @@ def diff(arr, n: int, axis: int = 0):
 def safe_sort(
     values,
     codes=None,
-    na_sentinel: int = -1,
+    na_sentinel: int | None = -1,
     assume_unique: bool = False,
     verify: bool = True,
 ) -> np.ndarray | MultiIndex | tuple[np.ndarray | MultiIndex, np.ndarray]:
@@ -1781,8 +1781,8 @@ def safe_sort(
     codes : list_like, optional
         Indices to ``values``. All out of bound indices are treated as
         "not found" and will be masked with ``na_sentinel``.
-    na_sentinel : int, default -1
-        Value in ``codes`` to mark "not found".
+    na_sentinel : int or None, default -1
+        Value in ``codes`` to mark "not found", or None to encode null values as normal.
         Ignored when ``codes`` is None.
     assume_unique : bool, default False
         When True, ``values`` are assumed to be unique, which can speed up

From ccd61ece0c682dfaad89797e6adf940f4f2bc222 Mon Sep 17 00:00:00 2001
From: richard <rhshadrach@gmail.com>
Date: Fri, 16 Sep 2022 20:58:11 -0400
Subject: [PATCH 3/4] TST: safe_sort orders nulls after strings for all null types

---
 pandas/tests/test_sorting.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 537792ea8263c..ba1943878cfad 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -506,10 +506,10 @@ def test_extension_array_codes(self, verify, na_sentinel):
         tm.assert_numpy_array_equal(codes, expected_codes)
 
 
-def test_mixed_str_nan():
-    values = np.array(["b", np.nan, "a", "b"], dtype=object)
+def test_mixed_str_null(nulls_fixture):
+    values = np.array(["b", nulls_fixture, "a", "b"], dtype=object)
     result = safe_sort(values)
-    expected = np.array([np.nan, "a", "b", "b"], dtype=object)
+    expected = np.array(["a", "b", "b", nulls_fixture], dtype=object)
     tm.assert_numpy_array_equal(result, expected)
 
 

From f189a2fd063acdbafa6a508798b5bfd96abf7854 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 30 Sep 2022 08:18:07 -0400
Subject: [PATCH 4/4] CLN: Remove debug prints from groupby dropna test

---
 pandas/tests/groupby/test_groupby_dropna.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index de3cc3fbdc6a4..360e3096ceb63 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -93,9 +93,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
         mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
     expected = pd.DataFrame(outputs, index=mi)
 
-    print(grouped)
-    print(expected)
-
     tm.assert_frame_equal(grouped, expected)