From eb5a40d6f53d1bb43b0249cf858e1c7c7ea62f18 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 16 Jun 2020 08:23:30 +0530 Subject: [PATCH 01/16] ENH: add masked mean function --- pandas/core/array_algos/masked_reductions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index bce6f1aafb2c5..fb653efb0044f 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -107,3 +107,7 @@ def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) + + +def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): + return sum(values=values, mask=mask, skipna=skipna, min_count=min_count) / np.count_nonzero(~mask) \ No newline at end of file From 8788b15f046a2dc568f0f87aa881955d96614437 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 16 Jun 2020 08:41:49 +0530 Subject: [PATCH 02/16] Indentation --- pandas/core/array_algos/masked_reductions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index fb653efb0044f..0f04723d93799 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -110,4 +110,5 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): - return sum(values=values, mask=mask, skipna=skipna, min_count=min_count) / np.count_nonzero(~mask) \ No newline at end of file + return sum(values=values, mask=mask, skipna=skipna, min_count=min_count) / np.count_nonzero(~mask) + From 30ea64601f7e4d6df8805661b424788f029bc340 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 16 Jun 2020 13:13:17 +0530 Subject: [PATCH 03/16] ENH: masked mean functioning --- pandas/core/array_algos/masked_reductions.py | 5 ++++- pandas/core/arrays/masked.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 0f04723d93799..298fdb9a845bc 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -110,5 +110,8 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): - return sum(values=values, mask=mask, skipna=skipna, min_count=min_count) / np.count_nonzero(~mask) + return _sumprod( + np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count + ) / np.count_nonzero(~mask) + diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7821f103909da..3cf25847ed3d0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -394,7 +394,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): data = self._data mask = self._mask - if name in {"sum", "prod", "min", "max"}: + if name in {"sum", "prod", "min", "max", "mean"}: op = getattr(masked_reductions, name) return op(data, mask, skipna=skipna, **kwargs) From 98f013a87bf59ef48cf265053d748ac035bf82e5 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 16 Jun 2020 13:39:21 +0530 Subject: [PATCH 04/16] Blankline error correction --- pandas/core/array_algos/masked_reductions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 298fdb9a845bc..4588f130be305 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -113,5 +113,3 @@ def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: i return _sumprod( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count ) / np.count_nonzero(~mask) - - From e017657aef201dd944c4d5687f6949215e6b5c68 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Wed, 17 Jun 2020 18:02:22 +0530 Subject: [PATCH 05/16] Separate lines in calculation --- pandas/core/array_algos/masked_reductions.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 4588f130be305..929e5486030b3 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -109,7 +109,9 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) -def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): - return _sumprod( - np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count - ) / np.count_nonzero(~mask) +def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): + + _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) + count = np.count_nonzero(~mask) + mean_value = _sum / count + return mean_value From 9f16d2758dd3b6c82e00bf2832b07883f567f9f5 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 15 Sep 2020 23:19:30 +0530 Subject: [PATCH 06/16] modified --- pandas/core/array_algos/masked_reductions.py | 2 +- pandas/tests/reductions/test_reductions.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 929e5486030b3..1c311bfdbbaa1 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -110,7 +110,7 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): - + _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) count = np.count_nonzero(~mask) mean_value = _sum / count diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8c2297699807d..18d05b1b64c46 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -689,6 +689,12 @@ def test_ops_consistency_on_empty(self, method): result = getattr(Series(dtype=float), method)() assert pd.isna(result) + # Empty Mean + if method == "mean": + s = Series([], dtype=float) + result = getattr(s, method)() + assert np.isnan(result) + # timedelta64[ns] tdser = Series([], dtype="m8[ns]") if method == "var": From 123838dfa73b00b0913c6280de8e7f3744982e0a Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Tue, 15 Sep 2020 23:53:35 +0530 Subject: [PATCH 07/16] added empty values check --- pandas/core/array_algos/masked_reductions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 1c311bfdbbaa1..babeea751066f 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -110,7 +110,8 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): - + if not values.size: + return libmissing.NA _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) count = np.count_nonzero(~mask) mean_value = _sum / count From 95ae20c8d399a129c5e96b3dc14e1589fed20f3b Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Thu, 24 Sep 2020 09:17:55 +0530 Subject: [PATCH 08/16] fixed linting error and removed redundant test --- pandas/core/array_algos/masked_reductions.py | 2 +- pandas/tests/reductions/test_reductions.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index babeea751066f..ec0f2c61e0a29 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -110,7 +110,7 @@ def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def mean(values: np.ndarray, mask: np.ndarray, skipna: bool = True): - if not values.size: + if not values.size or mask.all(): return libmissing.NA _sum = _sumprod(np.sum, values=values, mask=mask, skipna=skipna) count = np.count_nonzero(~mask) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 18d05b1b64c46..8c2297699807d 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -689,12 +689,6 @@ def test_ops_consistency_on_empty(self, method): result = getattr(Series(dtype=float), method)() assert pd.isna(result) - # Empty Mean - if method == "mean": - s = Series([], dtype=float) - result = getattr(s, method)() - assert np.isnan(result) - # timedelta64[ns] tdser = Series([], dtype="m8[ns]") if method == "var": From 9948bbe22fd4d2c99ed9b948353dcc1636a3f5d0 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Fri, 30 Oct 2020 16:12:13 +0530 Subject: [PATCH 09/16] tests on empty and nan for masked series --- pandas/tests/reductions/test_reductions.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8c2297699807d..36f9a626d84b1 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -689,6 +689,16 @@ def test_ops_consistency_on_empty(self, method): result = getattr(Series(dtype=float), method)() assert pd.isna(result) + # Nullable dtype on mean + eser = Series([], dtype="Int64") + nser = Series([np.nan], dtype="Int64") + if method == "mean": + result = getattr(eser, method)() + assert pd.isna(result) + + result = getattr(nser, method)() + assert pd.isna(result) + # timedelta64[ns] tdser = Series([], dtype="m8[ns]") if method == "var": From a9b4287b81375aa6a87564e9aacba827cb7cc08a Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Sat, 31 Oct 2020 00:03:57 +0530 Subject: [PATCH 10/16] modified assert statement --- pandas/tests/reductions/test_reductions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 36f9a626d84b1..a995af9348b95 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -694,10 +694,10 @@ def test_ops_consistency_on_empty(self, method): nser = Series([np.nan], dtype="Int64") if method == "mean": result = getattr(eser, method)() - assert pd.isna(result) + assert result is pd.NA result = getattr(nser, method)() - assert pd.isna(result) + assert result is pd.NA # timedelta64[ns] tdser = Series([], dtype="m8[ns]") From 13b985ac6a3119359986ba73f0893ce4c250ce73 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Sun, 13 Dec 2020 20:04:27 +0530 Subject: [PATCH 11/16] added parameterized test for empty/all-na series mean --- pandas/tests/reductions/test_reductions.py | 27 ++++++++++++++-------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a995af9348b95..88da618e1d4c5 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -679,6 +679,23 @@ def test_empty_multi(self, method, unit): expected = Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("method", ["mean"]) + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) + def test_ops_consistency_mean(self, method, dtype): + + # GH#34814 + # consistency for nullable dtypes on empty or ALL-NA mean + + # empty series + eser = Series([], dtype=dtype) + result = getattr(eser, method)() + assert result is pd.NA + + # ALL-NA series + nser = Series([np.nan], dtype=dtype) + result = getattr(nser, method)() + assert result is pd.NA + @pytest.mark.parametrize("method", ["mean", "median", "std", "var"]) def test_ops_consistency_on_empty(self, method): @@ -689,16 +706,6 @@ def test_ops_consistency_on_empty(self, method): result = getattr(Series(dtype=float), method)() assert pd.isna(result) - # Nullable dtype on mean - eser = Series([], dtype="Int64") - nser = Series([np.nan], dtype="Int64") - if method == "mean": - result = getattr(eser, method)() - assert result is pd.NA - - result = getattr(nser, method)() - assert result is pd.NA - # timedelta64[ns] tdser = Series([], dtype="m8[ns]") if method == "var": From b036b3b894045c56c60e7bf5e0fe63743da7ee72 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Sun, 13 Dec 2020 20:55:05 +0530 Subject: [PATCH 12/16] removed objects dtype --- pandas/tests/reductions/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 88da618e1d4c5..8ed316353830a 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -680,7 +680,7 @@ def test_empty_multi(self, method, unit): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["mean"]) - @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"]) def test_ops_consistency_mean(self, method, dtype): # GH#34814 From ebf88035d42e9f0e92239ccc37fc921658d456ec Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Sun, 13 Dec 2020 23:25:35 +0530 Subject: [PATCH 13/16] float64 to Float64 renamed, func renamed --- pandas/tests/reductions/test_reductions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8ed316353830a..cc5dc675c36e6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -680,8 +680,8 @@ def test_empty_multi(self, method, unit): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["mean"]) - @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"]) - def test_ops_consistency_mean(self, method, dtype): + @pytest.mark.parametrize("dtype", ["Float64", "Int64", "boolean"]) + def test_ops_consistency_on_empty_nullable(self, method, dtype): # GH#34814 # consistency for nullable dtypes on empty or ALL-NA mean From f9b17f710c8eb343f3e647b9553746ef8bc956d0 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Fri, 25 Dec 2020 00:41:36 +0530 Subject: [PATCH 14/16] Added entry in What's new v1.3.0 --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c7573ee860744..8ae0cd2bb924d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -160,7 +160,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) -- +- Performance improvement in :meth:`array.mean` (:issue:`34814`) - .. --------------------------------------------------------------------------- From c02d6ac0469de897cd7b457cf879ef76632dad78 Mon Sep 17 00:00:00 2001 From: Akshat Jain Date: Sat, 26 Dec 2020 18:08:53 +0530 Subject: [PATCH 15/16] changed array.mean to Series.mean with ExtensionDtype columns --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 8ae0cd2bb924d..40d52eaacadca 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -160,7 +160,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) -- Performance improvement in :meth:`array.mean` (:issue:`34814`) +- Performance improvement in :meth:`Series.mean` for ``ExtensionDtype`` columns (:issue:`34814`) - .. --------------------------------------------------------------------------- From f8e80a1d243f248365a024614d346a20c18a975e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Dec 2020 14:40:44 +0100 Subject: [PATCH 16/16] Update doc/source/whatsnew/v1.3.0.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 40d52eaacadca..2d82ffd95adb6 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -160,7 +160,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) -- Performance improvement in :meth:`Series.mean` for ``ExtensionDtype`` columns (:issue:`34814`) +- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) - .. ---------------------------------------------------------------------------