
Commit 19a3f5a

Backport PR #48539 on branch 1.5.x (REGR: groupby doesn't identify null values when sort=False) (#48568)
Backport PR #48539: REGR: groupby doesn't identify null values when sort=False

Co-authored-by: Richard Shadrach <[email protected]>
1 parent 6ee47f9 commit 19a3f5a

4 files changed (+39, -8 lines)

doc/source/whatsnew/v1.5.0.rst

-1
@@ -1016,7 +1016,6 @@ Numeric
 - Bug in division, ``pow`` and ``mod`` operations on array-likes with ``dtype="boolean"`` not being like their ``np.bool_`` counterparts (:issue:`46063`)
 - Bug in multiplying a :class:`Series` with ``IntegerDtype`` or ``FloatingDtype`` by an array-like with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`45622`)
 - Bug in :meth:`mean` where the optional dependency ``bottleneck`` causes precision loss linear in the length of the array. ``bottleneck`` has been disabled for :meth:`mean` improving the loss to log-linear but may result in a performance decrease. (:issue:`42878`)
-- Bug in :func:`factorize` would convert the value ``None`` to ``np.nan`` (:issue:`46601`)
 
 Conversion
 ^^^^^^^^^^
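The :issue:`46601` entry is removed because this fix restores the long-standing conversion: ``None`` in an object array is once again represented by the dtype's null (``np.nan``) in the factorized uniques, as the updated tests in pandas/tests/test_algos.py below assert. A minimal sketch of the expected behavior, assuming pandas 1.5.x with this backport applied:

import numpy as np
import pandas as pd

# None is normalized to the object dtype's null (np.nan) during factorization.
codes, uniques = pd.factorize(
    np.array(["a", None, "b", "a"], dtype=object), use_na_sentinel=False
)
print(codes)    # expected: [0 1 2 0]
print(uniques)  # expected: ['a' nan 'b']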

pandas/core/algorithms.py

+11
@@ -566,6 +566,17 @@ def factorize_array(
 
     hash_klass, values = _get_hashtable_algo(values)
 
+    # factorize can now handle differentiating various types of null values.
+    # However, for backwards compatibility we only use the null for the
+    # provided dtype. This may be revisited in the future, see GH#48476.
+    null_mask = isna(values)
+    if null_mask.any():
+        na_value = na_value_for_dtype(values.dtype, compat=False)
+        # Don't modify (potentially user-provided) array
+        # error: No overload variant of "where" matches argument types "Any", "object",
+        # "ndarray[Any, Any]"
+        values = np.where(null_mask, na_value, values)  # type: ignore[call-overload]
+
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
         values,
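In isolation, the step added here replaces every missing value with the canonical null for the array's dtype before hashing, so distinct null objects can no longer fall into distinct groups. A standalone sketch of that step (an illustration of the idea, not part of the commit; the imports mirror what factorize_array already has in scope):

import numpy as np
from pandas.core.dtypes.missing import isna, na_value_for_dtype

values = np.array(["x", None, float("nan"), "y"], dtype=object)

null_mask = isna(values)  # True wherever any kind of null sits
if null_mask.any():
    # np.nan is the null for object dtype; datetime-like dtypes get NaT, etc.
    na_value = na_value_for_dtype(values.dtype, compat=False)
    # Build a new array rather than mutating the (possibly user-provided) input.
    values = np.where(null_mask, na_value, values)

print(values)  # expected: ['x' nan nan 'y']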

pandas/tests/groupby/test_groupby_dropna.py

+22 -1
@@ -3,6 +3,8 @@
 
 from pandas.compat.pyarrow import pa_version_under1p01
 
+from pandas.core.dtypes.missing import na_value_for_dtype
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -422,7 +424,7 @@ def test_groupby_drop_nan_with_multi_index():
         (
             [
                 pd.Period("2012-02-01", freq="D"),
-                pd.NA,
+                pd.NaT,
                 pd.Period("2012-01-01", freq="D"),
                 pd.Period("2012-02-01", freq="D"),
             ],
@@ -454,3 +456,22 @@ def test_no_sort_keep_na(values, dtype, test_series):
         # TODO: Slicing reorders categories?
         expected.index = expected.index.reorder_categories(["y", "x"])
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("dtype", [object, None])
+def test_null_is_null_for_dtype(
+    sort, dtype, nulls_fixture, nulls_fixture2, test_series
+):
+    # GH#48506 - groups should always result in using the null for the dtype
+    df = pd.DataFrame({"a": [1, 2]})
+    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
+    obj = df["a"] if test_series else df
+    gb = obj.groupby(groups, dropna=False, sort=sort)
+    result = gb.sum()
+    index = pd.Index([na_value_for_dtype(groups.dtype)])
+    expected = pd.DataFrame({"a": [3]}, index=index)
+    if test_series:
+        tm.assert_series_equal(result, expected["a"])
+    else:
+        tm.assert_frame_equal(result, expected)
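The ``nulls_fixture`` / ``nulls_fixture2`` fixtures parametrize over pandas' null singletons (e.g. ``None``, ``np.nan``, ``pd.NaT``, ``pd.NA``), so the new test checks that every pairing collapses into a single null group. A concrete sketch of the behavior it asserts, assuming pandas 1.5.x with this backport:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
# Two different null representations used as group labels.
keys = pd.Series([None, np.nan], dtype=object)

# With dropna=False both labels are identified as the same null group,
# regardless of sort, so the rows are summed into a single NaN-indexed row.
for sort in (True, False):
    result = df.groupby(keys, dropna=False, sort=sort)["a"].sum()
    print(result)  # expected: one row, index NaN, value 3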

pandas/tests/test_algos.py

+6 -6
@@ -468,7 +468,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
             (
                 ["a", None, "b", "a"],
                 np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
-                np.array(["a", None, "b"], dtype=object),
+                np.array(["a", np.nan, "b"], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
@@ -482,16 +482,16 @@ def test_object_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
     @pytest.mark.parametrize(
         "data, expected_codes, expected_uniques",
         [
             (
                 [1, None, 1, 2],
                 np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
-                np.array([1, None, 2], dtype="O"),
+                np.array([1, np.nan, 2], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
@@ -505,8 +505,8 @@ def test_int_factorize_use_na_sentinel_false(
     ):
         codes, uniques = algos.factorize(data, use_na_sentinel=False)
 
-        tm.assert_numpy_array_equal(uniques, expected_uniques)
-        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
 
 
 class TestUnique:
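For context on the ``strict_nan=True`` additions: ``tm.assert_numpy_array_equal`` compares object arrays with ``strict_nan=False`` by default, under which ``None`` and ``np.nan`` count as the same missing value, so the looser assertions could not pin down which null ``factorize`` actually returns. A small illustration (a sketch, not part of the commit):

import numpy as np
import pandas._testing as tm

left = np.array(["a", None, "b"], dtype=object)
right = np.array(["a", np.nan, "b"], dtype=object)

# Default comparison treats None and np.nan as equivalent nulls, so this passes.
tm.assert_numpy_array_equal(left, right)

# With strict_nan=True the distinct null representations are no longer
# considered equal, so this raises an AssertionError.
try:
    tm.assert_numpy_array_equal(left, right, strict_nan=True)
except AssertionError:
    print("arrays differ when strict_nan=True")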
