From a1c48bcb86f4b1e98c87d9c51f461cede70c3cee Mon Sep 17 00:00:00 2001
From: richard
Date: Tue, 27 Sep 2022 16:19:42 -0400
Subject: [PATCH 1/4] REGR: groupby fails with nullable dtypes and dropna=False

---
 doc/source/whatsnew/v1.5.1.rst              |   1 +
 pandas/core/arrays/arrow/array.py           |   4 +-
 pandas/core/arrays/masked.py                |   2 +-
 pandas/tests/groupby/test_groupby_dropna.py | 113 +++++++++++---------
 4 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index ee66f2f649bc2..77622b25a9404 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -78,6 +78,7 @@ Fixed regressions
 - Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
 - Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
 - Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
+- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 03f734730a0f9..d0c64ec522eef 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -619,8 +619,8 @@ def factorize(
                 na_mask = indices.values == -1
                 na_index = na_mask.argmax()
                 if na_mask[na_index]:
-                    uniques = uniques.insert(na_index, self.dtype.na_value)
-                    na_code = 0 if na_index == 0 else indices[:na_index].argmax() + 1
+                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
+                    uniques = uniques.insert(na_code, self.dtype.na_value)
                     indices[indices >= na_code] += 1
                     indices[indices == -1] = na_code
         else:
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 043e0baf3ec0e..34ca205f7709a 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -913,7 +913,7 @@ def factorize(
             else:
                 # mypy error: Slice index must be an integer or None
                 # https://github.com/python/mypy/issues/2410
-                na_code = codes[:na_index].argmax() + 1  # type: ignore[misc]
+                na_code = codes[:na_index].max() + 1  # type: ignore[misc]
             codes[codes >= na_code] += 1
             codes[codes == -1] = na_code
             # dummy value for uniques; not used since uniques_mask will be True
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 360e3096ceb63..d68ea4b5ab733 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -393,75 +393,86 @@ def test_groupby_drop_nan_with_multi_index():
     tm.assert_frame_equal(result, expected)
 
 
+# Test all combinations of values e.g. 1, 2, and NA. Use string labels to
+# correspond to various dtypes. "z" always corresponds to NA.
+@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
+@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
 @pytest.mark.parametrize(
-    "values, dtype",
+    "uniques, dtype",
     [
-        ([2, np.nan, 1, 2], None),
-        ([2, np.nan, 1, 2], "UInt8"),
-        ([2, np.nan, 1, 2], "Int8"),
-        ([2, np.nan, 1, 2], "UInt16"),
-        ([2, np.nan, 1, 2], "Int16"),
-        ([2, np.nan, 1, 2], "UInt32"),
-        ([2, np.nan, 1, 2], "Int32"),
-        ([2, np.nan, 1, 2], "UInt64"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float32"),
-        ([2, np.nan, 1, 2], "Int64"),
-        ([2, np.nan, 1, 2], "Float64"),
+        ({"x": 1, "y": 2, "z": np.nan}, None),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
+        ({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
+        ({"x": "x", "y": "y", "z": None}, "category"),
+        ({"x": "x", "y": "y", "z": pd.NA}, "string"),
         pytest.param(
-            ["y", None, "x", "y"],
-            "category",
-            marks=pytest.mark.xfail(
-                reason="dropna=False not correct for categorical, GH#48645"
-            ),
-        ),
-        (["y", pd.NA, "x", "y"], "string"),
-        pytest.param(
-            ["y", pd.NA, "x", "y"],
+            {"x": "x", "y": "y", "z": pd.NA},
             "string[pyarrow]",
             marks=pytest.mark.skipif(
                 pa_version_under1p01, reason="pyarrow is not installed"
            ),
         ),
         (
-            ["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
+            {"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
             "datetime64[ns]",
         ),
         (
-            [
-                pd.Period("2012-02-01", freq="D"),
-                pd.NaT,
-                pd.Period("2012-01-01", freq="D"),
-                pd.Period("2012-02-01", freq="D"),
-            ],
+            {
+                "x": pd.Period("2012-01-01", freq="D"),
+                "y": pd.Period("2012-02-01", freq="D"),
+                "z": pd.NaT,
+            },
             None,
         ),
-        (pd.arrays.SparseArray([2, np.nan, 1, 2]), None),
     ],
 )
-@pytest.mark.parametrize("test_series", [True, False])
-def test_no_sort_keep_na(values, dtype, test_series):
-    # GH#46584
-    key = pd.Series(values, dtype=dtype)
-    df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})
+def test_no_sort_keep_na(
+    request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
+):
+    # GH#46584, GH#48794
+    sequence = "".join([sequence0, sequence1, sequence2, sequence3])
+    if dtype == "category" and "z" in sequence:
+        # Only xfail when nulls are present
+        msg = "dropna=False not correct for categorical, GH#48645"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    if dtype == "datetime64[ns]" and sequence == "zzzz":
+        msg = "Cannot construct datetime of all nulls"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    weights = {"x": 1, "y": 2, "z": 3}
+
+    key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
+    df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
     gb = df.groupby("key", dropna=False, sort=False)
-    if test_series:
-        gb = gb["a"]
-
-    warn = None
-    if isinstance(values, pd.arrays.SparseArray):
-        warn = FutureWarning
-    msg = "passing a SparseArray to pd.Index will store that array directly"
-    with tm.assert_produces_warning(warn, match=msg):
-        result = gb.sum()
-    expected = pd.DataFrame({"a": [5, 2, 3]}, index=key[:-1].rename("key"))
-    if test_series:
-        expected = expected["a"]
-    if expected.index.is_categorical():
-        # TODO: Slicing reorders categories?
-        expected.index = expected.index.reorder_categories(["y", "x"])
-    tm.assert_equal(result, expected)
+    result = gb.sum()
+    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
+    # issues with hashing np.nan
+    summed = {}
+    for label in sequence:
+        summed[label] = summed.get(label, 0) + weights[label]
+
+    if dtype == "category":
+        index = pd.CategoricalIndex(
+            [uniques[label] for label in summed],
+            # Get the nonnull categories in the order they appear ignoring duplicates
+            list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
+            name="key",
+        )
+    else:
+        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("test_series", [True, False])

From 8a107bee5ff59accde019193803220f78369c6ef Mon Sep 17 00:00:00 2001
From: richard
Date: Tue, 27 Sep 2022 16:19:42 -0400
Subject: [PATCH 2/4] Rework test

---
 doc/source/whatsnew/v1.5.1.rst              |   3 -
 pandas/tests/groupby/test_groupby_dropna.py | 110 +++++++++++---------
 2 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index 77622b25a9404..a733eb63be7d4 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -75,9 +75,6 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`)
 - Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`)
 - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
-- Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
-- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
-- Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
 - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d68ea4b5ab733..18abb3e30ef45 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -393,86 +393,92 @@ def test_groupby_drop_nan_with_multi_index():
     tm.assert_frame_equal(result, expected)
 
 
-# Test all combinations of values e.g. 1, 2, and NA. Use string labels to
-# correspond to various dtypes. "z" always corresponds to NA.
-@pytest.mark.parametrize("sequence0", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence1", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence2", ["x", "y", "z"])
-@pytest.mark.parametrize("sequence3", ["x", "y", "z"])
+# sequence_index enumerates all strings made up of x, y, z of length 4
+@pytest.mark.parametrize("sequence_index", range(3**4 + 1))
 @pytest.mark.parametrize(
-    "uniques, dtype",
+    "dtype",
     [
-        ({"x": 1, "y": 2, "z": np.nan}, None),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt8"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int8"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt16"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int16"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "UInt64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Float32"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Int64"),
-        ({"x": 1, "y": 2, "z": pd.NA}, "Float64"),
-        ({"x": "x", "y": "y", "z": None}, "category"),
-        ({"x": "x", "y": "y", "z": pd.NA}, "string"),
+        None,
+        "UInt8",
+        "Int8",
+        "UInt16",
+        "Int16",
+        "UInt32",
+        "Int32",
+        "UInt64",
+        "Int64",
+        "Float32",
+        "Int64",
+        "Float64",
+        "category",
+        "string",
         pytest.param(
-            {"x": "x", "y": "y", "z": pd.NA},
             "string[pyarrow]",
             marks=pytest.mark.skipif(
                 pa_version_under1p01, reason="pyarrow is not installed"
             ),
         ),
-        (
-            {"x": "2016-01-01", "y": "2017-01-01", "z": np.datetime64("NaT")},
-            "datetime64[ns]",
-        ),
-        (
-            {
-                "x": pd.Period("2012-01-01", freq="D"),
-                "y": pd.Period("2012-02-01", freq="D"),
-                "z": pd.NaT,
-            },
-            None,
-        ),
+        "datetime64[ns]",
+        "period[d]",
+        "Sparse[float]",
     ],
 )
-def test_no_sort_keep_na(
-    request, sequence0, sequence1, sequence2, sequence3, uniques, dtype
-):
+@pytest.mark.parametrize("test_series", [True, False])
+def test_no_sort_keep_na(request, sequence_index, dtype, test_series):
     # GH#46584, GH#48794
-    sequence = "".join([sequence0, sequence1, sequence2, sequence3])
+
+    # Convert sequence_index into a string sequence, e.g. 5 becomes "xxyz"
+    # This sequence is used for the grouper.
+    sequence = "".join(
+        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
+    )
+
     if dtype == "category" and "z" in sequence:
         # Only xfail when nulls are present
         msg = "dropna=False not correct for categorical, GH#48645"
         request.node.add_marker(pytest.mark.xfail(reason=msg))
-    if dtype == "datetime64[ns]" and sequence == "zzzz":
-        msg = "Cannot construct datetime of all nulls"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-    weights = {"x": 1, "y": 2, "z": 3}
-
-    key = pd.Series([uniques[label] for label in sequence], dtype=dtype)
-    df = pd.DataFrame({"key": key, "a": [weights[label] for label in sequence]})
+    # Unique values to use for grouper, depends on dtype
+    if dtype in ("string", "string[pyarrow]"):
+        uniques = {"x": "x", "y": "y", "z": pd.NA}
+    elif dtype in ("datetime64[ns]", "period[d]"):
+        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
+    else:
+        uniques = {"x": 1, "y": 2, "z": np.nan}
+    df = pd.DataFrame(
+        {
+            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
+            "a": [0, 1, 2, 3],
+        }
+    )
+    gb = df.groupby("key", dropna=False, sort=False)
+    if test_series:
+        gb = gb["a"]
     result = gb.sum()
+
     # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
     # issues with hashing np.nan
     summed = {}
-    for label in sequence:
-        summed[label] = summed.get(label, 0) + weights[label]
-
+    for idx, label in enumerate(sequence):
+        summed[label] = summed.get(label, 0) + idx
     if dtype == "category":
         index = pd.CategoricalIndex(
-            [uniques[label] for label in summed],
-            # Get the nonnull categories in the order they appear ignoring duplicates
+            [uniques[e] for e in summed],
             list({uniques[k]: 0 for k in sequence if not pd.isnull(uniques[k])}),
             name="key",
         )
+    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
+        index = pd.Index(
+            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
+        )
     else:
         index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
-    expected = pd.Series(summed.values(), index=index, name="a", dtype=None).to_frame()
-    tm.assert_frame_equal(result, expected)
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
+    if not test_series:
+        expected = expected.to_frame()
+
+    tm.assert_equal(result, expected)
 
 
 @pytest.mark.parametrize("test_series", [True, False])

From 0510d95d90662b4c92f502b7de2bd873c0e89220 Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Wed, 28 Sep 2022 18:22:06 -0400
Subject: [PATCH 3/4] Merge cleanup

---
 doc/source/whatsnew/v1.5.1.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
index a733eb63be7d4..77622b25a9404 100644
--- a/doc/source/whatsnew/v1.5.1.rst
+++ b/doc/source/whatsnew/v1.5.1.rst
@@ -75,6 +75,9 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`)
 - Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`)
 - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
+- Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`)
+- Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`)
+- Fixed Regression in :meth:`DataFrameGroupBy.apply` when user defined function is called on an empty dataframe (:issue:`47985`)
 - Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` when the grouper is a nullable data type (e.g. :class:`Int64`) or a PyArrow-backed string array, contains null values, and ``dropna=False`` (:issue:`48794`)
 
 .. ---------------------------------------------------------------------------

From 9800be72e6e15a36b7773ba5bd4bb74e53463a9a Mon Sep 17 00:00:00 2001
From: Richard Shadrach
Date: Wed, 28 Sep 2022 18:25:13 -0400
Subject: [PATCH 4/4] Range fixup

---
 pandas/tests/groupby/test_groupby_dropna.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 18abb3e30ef45..ee660dd073ce9 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -394,7 +394,7 @@ def test_groupby_drop_nan_with_multi_index():
 
 
 # sequence_index enumerates all strings made up of x, y, z of length 4
-@pytest.mark.parametrize("sequence_index", range(3**4 + 1))
+@pytest.mark.parametrize("sequence_index", range(3**4))
 @pytest.mark.parametrize(
     "dtype",
     [
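
A minimal reproduction sketch of the regression these patches address (illustration only, not part of the patch series; it assumes a pandas build that includes the fix, and the printed result is indicative). The problematic case is a nullable grouper whose first null is preceded by a repeated value: there codes[:na_index].argmax() returns the position of the largest code seen so far (2 below), while codes[:na_index].max() returns its value (1), so the old logic handed the null group an out-of-range code and the groupby failed.

    import pandas as pd

    # With sort=False the grouper factorizes to codes [0, 0, 1, -1]; the first
    # null sits at position 3, after a repeated value, which is exactly where
    # argmax and max diverge:
    #   codes[:3].argmax() + 1 == 3   (an out-of-range code for the null group)
    #   codes[:3].max() + 1    == 2   (the correct code for the null group)
    key = pd.Series([1, 1, 2, pd.NA], dtype="Int64")
    df = pd.DataFrame({"key": key, "a": [1, 2, 3, 4]})

    result = df.groupby("key", dropna=False, sort=False).sum()
    print(result)
    # With the fix the groups come out as 1 -> 3, 2 -> 3, <NA> -> 4.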
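For reference, a standalone sketch (again illustrative, not part of the patches) of how the reworked test derives its grouper from sequence_index: each index in range(3**4) is read as a four-digit base-3 number whose digits pick "x", "y" or "z" (the null label), least-significant digit first, so the in-test comment's "xxyz" for index 5 is the same digits read in the opposite order. Patch 4 drops the stray "+ 1" so the parametrization covers exactly these 81 combinations. The decode helper below is hypothetical; the test inlines the expression.

    def decode(sequence_index: int) -> str:
        # Read sequence_index as four base-3 digits, least-significant first,
        # and map 0/1/2 to the labels "x"/"y"/"z" ("z" marks the null value).
        return "".join(
            {0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)
        )

    assert decode(0) == "xxxx"         # all non-null, first label only
    assert decode(5) == "zyxx"         # 5 = 2*1 + 1*3 -> digits 2, 1, 0, 0
    assert decode(3**4 - 1) == "zzzz"  # 80 = 2222 in base 3: the all-null grouper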