From 8e102f235abb83083496e1243a2c583411ae5f21 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Fri, 1 Apr 2022 10:14:36 -0400
Subject: [PATCH] BUG: GroupBy equates different types of null values

---
 pandas/core/algorithms.py                   | 54 +++-----------
 pandas/tests/groupby/test_groupby_dropna.py | 82 +++++++++++++++++----
 2 files changed, 79 insertions(+), 57 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 77de1636a92eb..643a3ee05cc1a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -78,10 +78,7 @@
     ABCSeries,
     ABCTimedeltaArray,
 )
-from pandas.core.dtypes.missing import (
-    isna,
-    na_value_for_dtype,
-)
+from pandas.core.dtypes.missing import isna

 from pandas.core.array_algos.take import take_nd
 from pandas.core.construction import (
@@ -761,10 +758,6 @@ def factorize(
     if not isinstance(values, ABCMultiIndex):
         values = extract_array(values, extract_numpy=True)

-    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
-    # of values, assign na_sentinel=-1 to replace code value for NaN.
-    dropna = na_sentinel is not None
-
     if (
         isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
         and values.freq is not None
@@ -792,43 +785,17 @@ def factorize(
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex

-    # TODO: pass na_sentinel=na_sentinel to factorize_array. When sort is True and
-    # na_sentinel is None we append NA on the end because safe_sort does not
-    # handle null values in uniques.
-    if na_sentinel is None and sort:
-        na_sentinel_arg = -1
-    elif na_sentinel is None:
-        na_sentinel_arg = None
-    else:
-        na_sentinel_arg = na_sentinel
     codes, uniques = factorize_array(
         values,
-        na_sentinel=na_sentinel_arg,
+        na_sentinel=na_sentinel,
         size_hint=size_hint,
     )

     if sort and len(uniques) > 0:
-        if na_sentinel is None:
-            # TODO: Can remove when na_sentinel=na_sentinel as in TODO above
-            na_sentinel = -1
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )

-    if not dropna and sort:
-        # TODO: Can remove entire block when na_sentinel=na_sentinel as in TODO above
-        if na_sentinel is None:
-            na_sentinel_arg = -1
-        else:
-            na_sentinel_arg = na_sentinel
-        code_is_na = codes == na_sentinel_arg
-        if code_is_na.any():
-            # na_value is set based on the dtype of uniques, and compat set to False is
-            # because we do not want na_value to be 0 for integers
-            na_value = na_value_for_dtype(uniques.dtype, compat=False)
-            uniques = np.append(uniques, [na_value])
-            codes = np.where(code_is_na, len(uniques) - 1, codes)
-
     uniques = _reconstruct_data(uniques, original.dtype, original)

     return _re_wrap_factorize(original, uniques, codes)
@@ -1898,24 +1865,25 @@ def safe_sort(
         # may deal with them here without performance loss using `mode='wrap'`
         new_codes = reverse_indexer.take(codes, mode="wrap")

-        mask = codes == na_sentinel
-        if verify:
-            mask = mask | (codes < -len(values)) | (codes >= len(values))
+        if na_sentinel is not None:
+            mask = codes == na_sentinel
+            if verify:
+                mask = mask | (codes < -len(values)) | (codes >= len(values))

-    if mask is not None:
+    if na_sentinel is not None and mask is not None:
         np.putmask(new_codes, mask, na_sentinel)

     return ordered, ensure_platform_int(new_codes)


 def _sort_mixed(values) -> np.ndarray:
-    """order ints before strings in 1d arrays, safe in py3"""
+    """order ints before strings before nulls in 1d arrays"""
     str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-    none_pos = np.array([x is None for x in values], dtype=bool)
-    nums = np.sort(values[~str_pos & ~none_pos])
+    null_pos = np.array([isna(x) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos & ~null_pos])
     strs = np.sort(values[str_pos])
     return np.concatenate(
-        [nums, np.asarray(strs, dtype=object), np.array(values[none_pos])]
+        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
     )


diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 394b5adcf0370..36e022b75d553 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -1,3 +1,5 @@
+import copy
+
 import numpy as np
 import pytest

@@ -60,18 +62,23 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
         ),
         (
             False,
-            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
+            # "null1" and "null2" are placeholders for fixtures
+            [["A", "B"], ["A", "null1"], ["A", "null2"], ["B", "A"], ["null1", "B"]],
             {
-                "c": [12.0, 13.3, 123.23, 1.0],
-                "d": [12.0, 234.0, 123.0, 1.0],
-                "e": [12.0, 13.0, 1.0, 1.0],
+                "c": [12.0, 12.3, 1.0, 123.23, 1.0],
+                "d": [12.0, 233.0, 1.0, 123.0, 1.0],
+                "e": [12.0, 12.0, 1.0, 1.0, 1.0],
             },
         ),
     ],
 )
-def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
+def test_groupby_dropna_multi_index_dataframe_different_nan_in_two_groups(
     dropna, tuples, outputs, nulls_fixture, nulls_fixture2
 ):
+    if not dropna and type(nulls_fixture).__name__ == type(nulls_fixture2).__name__:
+        pytest.skip(
+            "tested in test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups"
+        )
     # GH 3729 this is to test that NA in different groups with different representations
     df_list = [
         ["A", "B", 12, 12, 12],
@@ -83,6 +90,60 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
     grouped = df.groupby(["a", "b"], dropna=dropna).sum()

+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[2][1] = nulls_fixture2
+        tuples[4][0] = nulls_fixture
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
+        ),
+        (
+            False,
+            # "null" is a placeholder for fixtures
+            [["A", "B"], ["A", "null"], ["B", "A"], ["null", "B"]],
+            {
+                "c": [12.0, 13.3, 123.23, 1.0],
+                "d": [12.0, 234.0, 123.0, 1.0],
+                "e": [12.0, 13.0, 1.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_same_nan_in_two_groups(
+    dropna, tuples, outputs, nulls_fixture
+):
+    # GH 3729 this is to test that NA in different groups with different representations
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", nulls_fixture, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        [nulls_fixture, "B", 1, 1, 1.0],
+        ["A", nulls_fixture, 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    grouped = df.groupby(["a", "b"], dropna=dropna).sum()
+
+    if not dropna:
+        tuples = copy.deepcopy(tuples)
+        tuples[1][1] = nulls_fixture
+        tuples[3][0] = nulls_fixture
     mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

     # Since right now, by default MI will drop NA from levels when we create MI
@@ -439,15 +500,8 @@ def test_no_sort_keep_na(values, dtype, test_series):
     gb = df.groupby("key", dropna=False, sort=False)
     if test_series:
         gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-        msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-        expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
-
+    result = gb.sum()
+    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
    if test_series:
         expected = expected["a"]
     if expected.index.is_categorical():
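Illustrative note (not part of the patch): the behavioural core of the algorithms.py change is the switch in the private helper _sort_mixed from detecting only None (x is None) to detecting any null via isna, so that every null value is placed after the numbers and strings. The snippet below is a minimal, self-contained sketch of that ordering rule; the wrapper name sort_mixed_sketch, the use of the public pd.isna instead of the internal helper, and the sample data are assumptions for illustration, not pandas API.

import numpy as np
import pandas as pd


def sort_mixed_sketch(values: np.ndarray) -> np.ndarray:
    # Mirror of the patched ``_sort_mixed``: numbers first, then strings,
    # then any nulls (np.nan, None, pd.NaT, ...) detected with ``pd.isna``.
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([pd.isna(x) for x in values], dtype=bool)
    nums = np.sort(values[~str_pos & ~null_pos])
    strs = np.sort(values[str_pos])
    return np.concatenate(
        [nums, np.asarray(strs, dtype=object), np.array(values[null_pos])]
    )


data = np.array([None, "b", 2, np.nan, "a", 1], dtype=object)
print(sort_mixed_sketch(data))  # [1 2 'a' 'b' None nan]

Under the previous "x is None" check, a value like np.nan was not flagged as null and fell through into the numeric np.sort call; the updated docstring ("order ints before strings before nulls in 1d arrays") documents the new ordering exercised by the revised tests.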