From 0364febd8f2d4d760b42f7abfca8e662f1abf505 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Mon, 31 Jul 2023 22:46:57 -0400
Subject: [PATCH 1/7] PERF: axis=1 reductions with EA dtypes

---
 pandas/core/arrays/masked.py          |  2 +
 pandas/core/frame.py                  | 18 +++++++
 pandas/tests/frame/test_reductions.py | 76 +++++++++++++++++++++++++++
 3 files changed, 96 insertions(+)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index bec875f2bbfa1..6f0dc47cc71a5 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1149,6 +1149,8 @@ def _reduce(
             if isna(result):
                 return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
             else:
+                if isinstance(result, int):
+                    result = np.intp(result)
                 result = result.reshape(1)
                 mask = np.zeros(1, dtype=bool)
                 return self._maybe_mask_result(result, mask)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e4755d5dd2bdf..b54d8f5c72f54 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11083,6 +11083,24 @@ def _get_data() -> DataFrame:
                 ).iloc[:0]
                 result.index = df.index
                 return result
+
+            dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+            if isinstance(dtype, ExtensionDtype) and name != "kurt":
+                name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
+                df = df.astype(dtype, copy=False)
+                arr = concat_compat(list(df._iter_column_arrays()))
+                nrows, ncols = df.shape
+                row_index = np.tile(np.arange(nrows), ncols)
+                col_index = np.repeat(np.arange(ncols), nrows)
+                ser = Series(arr, index=col_index)
+                result = ser.groupby(row_index).agg(name, **kwds)
+                result.index = df.index
+                if not skipna and name not in ("any", "all"):
+                    mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
+                    other = -1 if name in ("idxmax", "idxmin") else lib.no_default
+                    result = result.mask(mask, other)
+                return result
+
             df = df.T
 
         # After possibly _get_data and transposing, we are now in the
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index d5a741f92db35..f4eb36199bb26 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -1935,3 +1935,79 @@ def test_fails_on_non_numeric(kernel):
         msg = "|".join([msg1, msg2])
     with pytest.raises(TypeError, match=msg):
         getattr(df, kernel)(*args)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        "all",
+        "any",
+        "count",
+        "idxmax",
+        "idxmin",
+        "kurt",
+        "kurtosis",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "nunique",
+        "prod",
+        "product",
+        "sem",
+        "skew",
+        "std",
+        "sum",
+        "var",
+    ],
+)
+@pytest.mark.parametrize("min_count", [0, 2])
+def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
+    df = DataFrame(
+        {
+            "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),
+            "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype),
+        },
+    )
+    expected_df = DataFrame(
+        {
+            "a": [0.0, 1.0, 2.0, 3.0],
+            "b": [0.0, 1.0, np.nan, 3.0],
+        },
+    )
+    if method in ("count", "nunique"):
+        expected_dtype = "int64"
+    elif method in ("all", "any"):
+        expected_dtype = "boolean"
+    elif method in (
+        "kurt",
+        "kurtosis",
+        "mean",
+        "median",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ) and not any_numeric_ea_dtype.startswith("Float"):
+        expected_dtype = "Float64"
+    else:
+        expected_dtype = any_numeric_ea_dtype
+
+    kwargs = {}
+    if method not in ("count", "nunique", "quantile"):
+        kwargs["skipna"] = skipna
+    if method in ("prod", "product", "sum"):
+        kwargs["min_count"] = min_count
+
+    warn = None
+    msg = None
+    if not skipna and method in ("idxmax", "idxmin"):
+        warn = FutureWarning
+        msg = f"The behavior of DataFrame.{method} with all-NA values"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = getattr(df, method)(axis=1, **kwargs)
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = getattr(expected_df, method)(axis=1, **kwargs)
+    if method not in ("idxmax", "idxmin"):
+        expected = expected.astype(expected_dtype)
+    tm.assert_series_equal(result, expected)

From 114c76491814b65c00f25fcf65246e23d8c298d1 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Mon, 31 Jul 2023 22:52:08 -0400
Subject: [PATCH 2/7] whatsnew

---
 doc/source/whatsnew/v2.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 98465b5686ca1..930ccbf7d3ce0 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -539,6 +539,7 @@ Performance improvements
 - Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`)
 - :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`)
 - Performance improvement accessing :attr:`arrays.IntegerArrays.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
+- Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`)
 - Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`)
 - Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`)
 - Performance improvement in :class:`Series` reductions (:issue:`52341`)

From 8335a189eead9288e96d6a92deb98c95b53b92c7 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Tue, 1 Aug 2023 06:49:56 -0400
Subject: [PATCH 3/7] fixes: skip fastpath for zero-column frames; don't re-raise SparseArray any/all in groupby

---
 pandas/core/frame.py           | 33 +++++++++++++++++----------------
 pandas/core/groupby/groupby.py |  5 ++++-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b54d8f5c72f54..cd4a6e6ca96ce 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11084,22 +11084,23 @@ def _get_data() -> DataFrame:
                 result.index = df.index
                 return result
 
-            dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
-            if isinstance(dtype, ExtensionDtype) and name != "kurt":
-                name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
-                df = df.astype(dtype, copy=False)
-                arr = concat_compat(list(df._iter_column_arrays()))
-                nrows, ncols = df.shape
-                row_index = np.tile(np.arange(nrows), ncols)
-                col_index = np.repeat(np.arange(ncols), nrows)
-                ser = Series(arr, index=col_index)
-                result = ser.groupby(row_index).agg(name, **kwds)
-                result.index = df.index
-                if not skipna and name not in ("any", "all"):
-                    mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
-                    other = -1 if name in ("idxmax", "idxmin") else lib.no_default
-                    result = result.mask(mask, other)
-                return result
+            if df.shape[1] and name != "kurt":
+                dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
+                if isinstance(dtype, ExtensionDtype):
+                    name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
+                    df = df.astype(dtype, copy=False)
+                    arr = concat_compat(list(df._iter_column_arrays()))
+                    nrows, ncols = df.shape
+                    row_index = np.tile(np.arange(nrows), ncols)
+                    col_index = np.repeat(np.arange(ncols), nrows)
+                    ser = Series(arr, index=col_index)
+                    result = ser.groupby(row_index).agg(name, **kwds)
+                    result.index = df.index
+                    if not skipna and name not in ("any", "all"):
+                        mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
+                        other = -1 if name in ("idxmax", "idxmin") else lib.no_default
+                        result = result.mask(mask, other)
+                    return result
 
             df = df.T
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index dbb2d0e25de2e..f12a0849d15bb 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -105,6 +105,7 @@ class providing the base-class of operations.
     ExtensionArray,
     FloatingArray,
     IntegerArray,
+    SparseArray,
 )
 from pandas.core.base import (
     PandasObject,
@@ -1905,7 +1906,9 @@ def array_func(values: ArrayLike) -> ArrayLike:
                 # and non-applicable functions
                 # try to python agg
                 # TODO: shouldn't min_count matter?
-                if how in ["any", "all", "std", "sem"]:
+                if how in ["any", "all"] and isinstance(values, SparseArray):
+                    pass
+                elif how in ["any", "all", "std", "sem"]:
                     raise  # TODO: re-raise as TypeError?  should not be reached
             else:
                 return result

From e51fb7ed9ed59f81d637971076468ccab84c95f0 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Tue, 1 Aug 2023 21:48:23 -0400
Subject: [PATCH 4/7] updates: revert masked.py int coercion; add GH ref to test

---
 pandas/core/arrays/masked.py          | 2 --
 pandas/tests/frame/test_reductions.py | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 6f0dc47cc71a5..bec875f2bbfa1 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1149,8 +1149,6 @@ def _reduce(
             if isna(result):
                 return self._wrap_na_result(name=name, axis=0, mask_size=(1,))
             else:
-                if isinstance(result, int):
-                    result = np.intp(result)
                 result = result.reshape(1)
                 mask = np.zeros(1, dtype=bool)
                 return self._maybe_mask_result(result, mask)
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index f4eb36199bb26..70b9d642e1504 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -1963,6 +1963,7 @@ def test_fails_on_non_numeric(kernel):
 )
 @pytest.mark.parametrize("min_count", [0, 2])
 def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype):
+    # GH 54341
     df = DataFrame(
         {
             "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),

From bb37fe11207f0ab1137e6baeb1d0efef2e3da8eb Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Tue, 1 Aug 2023 22:45:21 -0400
Subject: [PATCH 5/7] add TODO note

---
 pandas/core/groupby/groupby.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index f12a0849d15bb..e6422c5ba0bba 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1906,6 +1906,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
                 # and non-applicable functions
                 # try to python agg
                 # TODO: shouldn't min_count matter?
+                # TODO: avoid special casing SparseArray here
                 if how in ["any", "all"] and isinstance(values, SparseArray):
                     pass
                 elif how in ["any", "all", "std", "sem"]:

From 266b2af624994d7d635a0635394b79dd82c5aa50 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Thu, 3 Aug 2023 18:49:43 -0400
Subject: [PATCH 6/7] add comments

---
 pandas/core/frame.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 9e3516ce9b037..d6ef5838cb052 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11132,9 +11132,16 @@ def _get_data() -> DataFrame:
                 result.index = df.index
                 return result
 
+            # kurtosis excluded since groupby does not implement it
             if df.shape[1] and name != "kurt":
                 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
                 if isinstance(dtype, ExtensionDtype):
+                    # GH 54341: fastpath for EA-backed axis=1 reductions
+                    # This flattens the frame into a single 1D array while keeping
+                    # track of the row and column indices of the original frame. Once
+                    # flattened, grouping by the row indices and aggregating should
+                    # be equivalent to transposing the original frame and aggregating
+                    # with axis=0.
                     name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
                     df = df.astype(dtype, copy=False)
                     arr = concat_compat(list(df._iter_column_arrays()))

From 279766d0b15ba1c80c42488b5e340a3305f57717 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 9 Aug 2023 08:57:38 -0400
Subject: [PATCH 7/7] add copy=False

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d6ef5838cb052..1c14c3bfc835c 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11148,7 +11148,7 @@ def _get_data() -> DataFrame:
                     nrows, ncols = df.shape
                     row_index = np.tile(np.arange(nrows), ncols)
                     col_index = np.repeat(np.arange(ncols), nrows)
-                    ser = Series(arr, index=col_index)
+                    ser = Series(arr, index=col_index, copy=False)
                     result = ser.groupby(row_index).agg(name, **kwds)
                     result.index = df.index
                     if not skipna and name not in ("any", "all"):