From 0364febd8f2d4d760b42f7abfca8e662f1abf505 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 31 Jul 2023 22:46:57 -0400 Subject: [PATCH 1/7] PERF: axis=1 reductions with EA dtypes --- pandas/core/arrays/masked.py | 2 + pandas/core/frame.py | 18 +++++++ pandas/tests/frame/test_reductions.py | 76 +++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index bec875f2bbfa1..6f0dc47cc71a5 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1149,6 +1149,8 @@ def _reduce( if isna(result): return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) else: + if isinstance(result, int): + result = np.intp(result) result = result.reshape(1) mask = np.zeros(1, dtype=bool) return self._maybe_mask_result(result, mask) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e4755d5dd2bdf..b54d8f5c72f54 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11083,6 +11083,24 @@ def _get_data() -> DataFrame: ).iloc[:0] result.index = df.index return result + + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype) and name != "kurt": + name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + nrows, ncols = df.shape + row_index = np.tile(np.arange(nrows), ncols) + col_index = np.repeat(np.arange(ncols), nrows) + ser = Series(arr, index=col_index) + result = ser.groupby(row_index).agg(name, **kwds) + result.index = df.index + if not skipna and name not in ("any", "all"): + mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) + other = -1 if name in ("idxmax", "idxmin") else lib.no_default + result = result.mask(mask, other) + return result + df = df.T # After possibly _get_data and transposing, we are now in the diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d5a741f92db35..f4eb36199bb26 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1935,3 +1935,79 @@ def test_fails_on_non_numeric(kernel): msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) + + +@pytest.mark.parametrize( + "method", + [ + "all", + "any", + "count", + "idxmax", + "idxmin", + "kurt", + "kurtosis", + "max", + "mean", + "median", + "min", + "nunique", + "prod", + "product", + "sem", + "skew", + "std", + "sum", + "var", + ], +) +@pytest.mark.parametrize("min_count", [0, 2]) +def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): + df = DataFrame( + { + "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype), + "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype), + }, + ) + expected_df = DataFrame( + { + "a": [0.0, 1.0, 2.0, 3.0], + "b": [0.0, 1.0, np.nan, 3.0], + }, + ) + if method in ("count", "nunique"): + expected_dtype = "int64" + elif method in ("all", "any"): + expected_dtype = "boolean" + elif method in ( + "kurt", + "kurtosis", + "mean", + "median", + "sem", + "skew", + "std", + "var", + ) and not any_numeric_ea_dtype.startswith("Float"): + expected_dtype = "Float64" + else: + expected_dtype = any_numeric_ea_dtype + + kwargs = {} + if method not in ("count", "nunique", "quantile"): + kwargs["skipna"] = skipna + if method in ("prod", "product", "sum"): + kwargs["min_count"] = min_count + + warn = None + msg = None + if not skipna and method in ("idxmax", "idxmin"): + warn = FutureWarning + msg = f"The behavior of DataFrame.{method} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(df, method)(axis=1, **kwargs) + with tm.assert_produces_warning(warn, match=msg): + expected = getattr(expected_df, method)(axis=1, **kwargs) + if method not in ("idxmax", "idxmin"): + expected = expected.astype(expected_dtype) + tm.assert_series_equal(result, expected) From 114c76491814b65c00f25fcf65246e23d8c298d1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 31 Jul 2023 22:52:08 -0400 Subject: [PATCH 2/7] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 98465b5686ca1..930ccbf7d3ce0 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -539,6 +539,7 @@ Performance improvements - Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`) - :class:`Period`'s default formatter (`period_format`) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime(fmt=None)`, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`) - Performance improvement accessing :attr:`arrays.IntegerArrays.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`) +- Performance improvement in :class:`DataFrame` reductions with ``axis=1`` and extension dtypes (:issue:`54341`) - Performance improvement in :class:`DataFrame` reductions with ``axis=None`` and extension dtypes (:issue:`54308`) - Performance improvement in :class:`MultiIndex` and multi-column operations (e.g. :meth:`DataFrame.sort_values`, :meth:`DataFrame.groupby`, :meth:`Series.unstack`) when index/column values are already sorted (:issue:`53806`) - Performance improvement in :class:`Series` reductions (:issue:`52341`) From 8335a189eead9288e96d6a92deb98c95b53b92c7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 1 Aug 2023 06:49:56 -0400 Subject: [PATCH 3/7] fixes --- pandas/core/frame.py | 33 +++++++++++++++++---------------- pandas/core/groupby/groupby.py | 5 ++++- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b54d8f5c72f54..cd4a6e6ca96ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11084,22 +11084,23 @@ def _get_data() -> DataFrame: result.index = df.index return result - dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) - if isinstance(dtype, ExtensionDtype) and name != "kurt": - name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) - df = df.astype(dtype, copy=False) - arr = concat_compat(list(df._iter_column_arrays())) - nrows, ncols = df.shape - row_index = np.tile(np.arange(nrows), ncols) - col_index = np.repeat(np.arange(ncols), nrows) - ser = Series(arr, index=col_index) - result = ser.groupby(row_index).agg(name, **kwds) - result.index = df.index - if not skipna and name not in ("any", "all"): - mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) - other = -1 if name in ("idxmax", "idxmin") else lib.no_default - result = result.mask(mask, other) - return result + if df.shape[1] and name != "kurt": + dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) + if isinstance(dtype, ExtensionDtype): + name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) + df = df.astype(dtype, copy=False) + arr = concat_compat(list(df._iter_column_arrays())) + nrows, ncols = df.shape + row_index = np.tile(np.arange(nrows), ncols) + col_index = np.repeat(np.arange(ncols), nrows) + ser = Series(arr, index=col_index) + result = ser.groupby(row_index).agg(name, **kwds) + result.index = df.index + if not skipna and name not in ("any", "all"): + mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) + other = -1 if name in ("idxmax", "idxmin") else lib.no_default + result = result.mask(mask, other) + return result df = df.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dbb2d0e25de2e..f12a0849d15bb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -105,6 +105,7 @@ class providing the base-class of operations. ExtensionArray, FloatingArray, IntegerArray, + SparseArray, ) from pandas.core.base import ( PandasObject, @@ -1905,7 +1906,9 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? - if how in ["any", "all", "std", "sem"]: + if how in ["any", "all"] and isinstance(values, SparseArray): + pass + elif how in ["any", "all", "std", "sem"]: raise # TODO: re-raise as TypeError? should not be reached else: return result From e51fb7ed9ed59f81d637971076468ccab84c95f0 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 1 Aug 2023 21:48:23 -0400 Subject: [PATCH 4/7] updates --- pandas/core/arrays/masked.py | 2 -- pandas/tests/frame/test_reductions.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 6f0dc47cc71a5..bec875f2bbfa1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1149,8 +1149,6 @@ def _reduce( if isna(result): return self._wrap_na_result(name=name, axis=0, mask_size=(1,)) else: - if isinstance(result, int): - result = np.intp(result) result = result.reshape(1) mask = np.zeros(1, dtype=bool) return self._maybe_mask_result(result, mask) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index f4eb36199bb26..70b9d642e1504 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1963,6 +1963,7 @@ def test_fails_on_non_numeric(kernel): ) @pytest.mark.parametrize("min_count", [0, 2]) def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): + # GH 54341 df = DataFrame( { "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype), From bb37fe11207f0ab1137e6baeb1d0efef2e3da8eb Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 1 Aug 2023 22:45:21 -0400 Subject: [PATCH 5/7] add TODO note --- pandas/core/groupby/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f12a0849d15bb..e6422c5ba0bba 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1906,6 +1906,7 @@ def array_func(values: ArrayLike) -> ArrayLike: # and non-applicable functions # try to python agg # TODO: shouldn't min_count matter? + # TODO: avoid special casing SparseArray here if how in ["any", "all"] and isinstance(values, SparseArray): pass elif how in ["any", "all", "std", "sem"]: From 266b2af624994d7d635a0635394b79dd82c5aa50 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 3 Aug 2023 18:49:43 -0400 Subject: [PATCH 6/7] add comments --- pandas/core/frame.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9e3516ce9b037..d6ef5838cb052 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11132,9 +11132,16 @@ def _get_data() -> DataFrame: result.index = df.index return result + # kurtosis excluded since groupby does not implement it if df.shape[1] and name != "kurt": dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) if isinstance(dtype, ExtensionDtype): + # GH 54341: fastpath for EA-backed axis=1 reductions + # This flattens the frame into a single 1D array while keeping + # track of the row and column indices of the original frame. Once + # flattened, grouping by the row indices and aggregating should + # be equivalent to transposing the original frame and aggregating + # with axis=0. name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) df = df.astype(dtype, copy=False) arr = concat_compat(list(df._iter_column_arrays())) From 279766d0b15ba1c80c42488b5e340a3305f57717 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 9 Aug 2023 08:57:38 -0400 Subject: [PATCH 7/7] add copy=False --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6ef5838cb052..1c14c3bfc835c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11148,7 +11148,7 @@ def _get_data() -> DataFrame: nrows, ncols = df.shape row_index = np.tile(np.arange(nrows), ncols) col_index = np.repeat(np.arange(ncols), nrows) - ser = Series(arr, index=col_index) + ser = Series(arr, index=col_index, copy=False) result = ser.groupby(row_index).agg(name, **kwds) result.index = df.index if not skipna and name not in ("any", "all"):