From f4b55c28317b43bddec9f0c006808e574940a6cd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 18 Jan 2023 19:56:49 +0100 Subject: [PATCH 1/3] PERF: Avoid re-computing mask in nanmedian --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/nanops.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1a071ab978de9..b5df05cce8003 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -870,6 +870,7 @@ Performance improvements - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) +- Performance improvement in :meth:`Series.median` for nullable dtypes (:issue:`1`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 02372356d3fe4..4cba1e41d1a63 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -746,16 +746,19 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= 2.0 """ - def get_median(x): - mask = notna(x) - if not skipna and not mask.all(): + def get_median(x, _mask): + if _mask is None: + _mask = notna(x) + else: + _mask = ~_mask + if not skipna and not _mask.all(): return np.nan with warnings.catch_warnings(): # Suppress RuntimeWarning about All-NaN slice warnings.filterwarnings( "ignore", "All-NaN slice encountered", RuntimeWarning ) - res = np.nanmedian(x[mask]) + res = np.nanmedian(x[_mask]) return res values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask) @@ -796,7 +799,7 @@ def get_median(x): else: # otherwise return a scalar value - res = get_median(values) if notempty else np.nan + res = get_median(values, mask) if notempty else np.nan return _wrap_results(res, dtype) From 60d63716d225054bd296d48fb3b931dbf51dead6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 18 Jan 2023 19:57:33 +0100 Subject: [PATCH 2/3] Add gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b5df05cce8003..9366404445fb2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -870,7 +870,7 @@ Performance improvements - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) - Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) -- Performance improvement in :meth:`Series.median` for nullable dtypes (:issue:`1`) +- Performance improvement in :meth:`Series.median` for nullable dtypes (:issue:`50838`) - Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsetes (:issue:`35296`) - Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`) - Fixed a reference leak in :func:`read_hdf` (:issue:`37441`) From 55532e273fcbabca5e4605879a78eef6dd890440 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 18 Jan 2023 21:28:12 +0100 Subject: [PATCH 3/3] Fix --- pandas/core/nanops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4cba1e41d1a63..61f1bcdef9568 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -746,7 +746,7 @@ def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask= 2.0 """ - def get_median(x, _mask): + def get_median(x, _mask=None): if _mask is None: _mask = notna(x) else: