From 572e923bd766f8fc0e933b0e8be02a4ff1a348b0 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 15 Feb 2025 09:09:29 -0500
Subject: [PATCH 1/2] BUG(string dtype): Empty sum produces incorrect result

---
 doc/source/whatsnew/v2.3.0.rst                |  1 +
 pandas/core/arrays/base.py                    |  8 +++++-
 pandas/tests/frame/test_reductions.py         | 10 ++++++++
 pandas/tests/groupby/test_reductions.py       | 14 +++++++++++
 pandas/tests/resample/test_base.py            | 25 +++++++++++++++++++
 .../tests/resample/test_resampler_grouper.py  | 20 +++++++++++++++
 6 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 8bdddb5b7f85d..041d06b31c332 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -113,6 +113,7 @@ Strings
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`???`)
 
 Interval
 ^^^^^^^^
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 33745438e2aea..edb8a06f4785c 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2628,7 +2628,13 @@ def _groupby_op(
             if op.how not in ["any", "all"]:
                 # Fail early to avoid conversion to object
                 op._get_cython_function(op.kind, op.how, np.dtype(object), False)
-            npvalues = self.to_numpy(object, na_value=np.nan)
+
+            arr = self
+            if op.how == "sum":
+                assert "skipna" in kwargs
+                if kwargs["skipna"] and min_count == 0:
+                    arr = arr.fillna("")
+            npvalues = arr.to_numpy(object, na_value=np.nan)
         else:
             raise NotImplementedError(
                 f"function is not implemented for this dtype: {self.dtype}"
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 64e686d25faa7..0b90830e871d8 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index):
         expected = Series([], index=index, dtype=expected_dtype)
         tm.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("min_count", [0, 1])
+    def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
+        # GH#???
+        dtype = string_dtype_no_object
+        df = DataFrame({"a": [pd.NA]}, dtype=dtype)
+        result = df.sum(axis=1, skipna=skipna, min_count=min_count)
+        value = "" if skipna and min_count == 0 else pd.NA
+        expected = Series([value], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
     @pytest.mark.parametrize("numeric_only", [None, True, False])
     def test_sum_prod_nanops(self, method, unit, numeric_only):
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index ea876cfdf4933..53130bdab38ae 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -955,6 +955,20 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count):
+    # GH#???
+    dtype = string_dtype_no_object
+    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
+    gb = df.groupby("a")
+    result = gb.sum(skipna=skipna, min_count=min_count)
+    value = "" if skipna and min_count == 0 else pd.NA
+    expected = DataFrame(
+        {"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_max_nan_bug():
     df = DataFrame(
         {
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
index b2d9f6c0e3eb0..a04a3b742471f 100644
--- a/pandas/tests/resample/test_base.py
+++ b/pandas/tests/resample/test_base.py
@@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method):
     assert result.index.freq == expected.index.freq
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_resample_empty_sum_string(string_dtype_no_object, min_count):
+    # GH#???
+    dtype = string_dtype_no_object
+    ser = Series(
+        pd.NA,
+        index=DatetimeIndex(
+            [
+                "2000-01-01 00:00:00",
+                "2000-01-01 00:00:10",
+                "2000-01-01 00:00:20",
+                "2000-01-01 00:00:30",
+            ]
+        ),
+        dtype=dtype,
+    )
+    rs = ser.resample("20s")
+    result = rs.sum(min_count=min_count)
+
+    value = "" if min_count == 0 else pd.NA
+    index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s")
+    expected = Series(value, index=index, dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "freq",
     [
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index e7850f96b3b0f..bc32d3ecb1a62 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -494,6 +494,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_groupby_resample_empty_sum_string(
+    string_dtype_no_object, test_frame, min_count
+):
+    # GH#???
+    dtype = string_dtype_no_object
+    test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
+    gbrs = test_frame.groupby("A").resample("40s")
+    result = gbrs.sum(min_count=min_count)
+
+    index = pd.MultiIndex(
+        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
+        codes=[[0, 1, 2], [0, 0, 0]],
+        names=["A", None],
+    )
+    value = "" if min_count == 0 else pd.NA
+    expected = DataFrame({"B": value}, index=index, dtype=dtype)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_resample_with_list_of_keys():
     # GH 47362
     df = DataFrame(

From 07bb2ae80572f5e07710705c8b566b0b09544053 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sat, 15 Feb 2025 09:16:27 -0500
Subject: [PATCH 2/2] GH refs

---
 doc/source/whatsnew/v2.3.0.rst                  | 2 +-
 pandas/core/arrays/base.py                      | 2 ++
 pandas/tests/frame/test_reductions.py           | 2 +-
 pandas/tests/groupby/test_reductions.py         | 2 +-
 pandas/tests/resample/test_base.py              | 2 +-
 pandas/tests/resample/test_resampler_grouper.py | 2 +-
 6 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 041d06b31c332..54dccb97bc9eb 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -107,13 +107,13 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values returning ``0`` instead of the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
 - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
 - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
 - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`???`)
 
 Interval
 ^^^^^^^^
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index edb8a06f4785c..dbf2090e53579 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -2631,6 +2631,8 @@ def _groupby_op(
 
             arr = self
             if op.how == "sum":
+                # https://github.com/pandas-dev/pandas/issues/60229
+                # If skipna and min_count == 0, all NA should sum to the empty string.
                 assert "skipna" in kwargs
                 if kwargs["skipna"] and min_count == 0:
                     arr = arr.fillna("")
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 0b90830e871d8..127f0fc50a747 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -837,7 +837,7 @@ def test_axis_1_empty(self, all_reductions, index):
 
     @pytest.mark.parametrize("min_count", [0, 1])
     def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
-        # GH#???
+        # https://github.com/pandas-dev/pandas/issues/60229
         dtype = string_dtype_no_object
         df = DataFrame({"a": [pd.NA]}, dtype=dtype)
         result = df.sum(axis=1, skipna=skipna, min_count=min_count)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index 53130bdab38ae..45047fe004aa0 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -957,7 +957,7 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
 
 @pytest.mark.parametrize("min_count", [0, 1])
 def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count):
-    # GH#???
+    # https://github.com/pandas-dev/pandas/issues/60229
     dtype = string_dtype_no_object
     df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
     gb = df.groupby("a")
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
index a04a3b742471f..0db5c0c82d4d4 100644
--- a/pandas/tests/resample/test_base.py
+++ b/pandas/tests/resample/test_base.py
@@ -225,7 +225,7 @@ def test_resample_empty_series(freq, index, resample_method):
 
 @pytest.mark.parametrize("min_count", [0, 1])
 def test_resample_empty_sum_string(string_dtype_no_object, min_count):
-    # GH#???
+    # https://github.com/pandas-dev/pandas/issues/60229
     dtype = string_dtype_no_object
     ser = Series(
         pd.NA,
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index bc32d3ecb1a62..7870c5a9d3e17 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -498,7 +498,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
 def test_groupby_resample_empty_sum_string(
     string_dtype_no_object, test_frame, min_count
 ):
-    # GH#???
+    # https://github.com/pandas-dev/pandas/issues/60229
     dtype = string_dtype_no_object
     test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
     gbrs = test_frame.groupby("A").resample("40s")