From 99234602debba7e84c61756b4f201228227a3c71 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Sat, 23 Mar 2024 16:34:41 +0100 Subject: [PATCH 01/11] add check for not completely unstable parameter range --- pandas/core/nanops.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6cb825e9b79a2..04aa4f24eb1df 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -2,6 +2,7 @@ import functools import itertools +from math import frexp from typing import ( Any, Callable, @@ -1358,8 +1359,14 @@ def nankurt( # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero - numerator = _zero_out_fperr(numerator) - denominator = _zero_out_fperr(denominator) + is_zero_out = True + if count < 100: + _, numexp = frexp(numerator) + _, denomexp = frexp(denominator) + is_zero_out = abs(numexp - denomexp) > 24 + if is_zero_out: + numerator = _zero_out_fperr(numerator) + denominator = _zero_out_fperr(denominator) if not isinstance(denominator, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before From d413792892f16b5d7e0a75e99a0c31262f33aacb Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 26 Mar 2024 23:31:37 +0100 Subject: [PATCH 02/11] refine cutoff for small arrays --- pandas/core/nanops.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 04aa4f24eb1df..6d068c6cc1ffc 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -2,7 +2,6 @@ import functools import itertools -from math import frexp from typing import ( Any, Callable, @@ -1358,15 +1357,13 @@ def nankurt( # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior - # to fix the fperr to treat denom <1e-14 as zero - is_zero_out = True - if count < 100: - _, numexp = frexp(numerator) - _, denomexp = frexp(denominator) - is_zero_out = abs(numexp - denomexp) > 24 - if is_zero_out: - numerator = _zero_out_fperr(numerator) - denominator = _zero_out_fperr(denominator) + # to fix the fperr to treat denom <1e-14 as zero (default cutoff) + # GH-57972 set cutoff lower for small arrays to prevent cutoff of otherwise + # numerically stable values + length = count[0] if isinstance(count, np.ndarray) else count + cutoff = 1e-14 if length > 100 else 1e-16 + numerator = _zero_out_fperr(numerator, cutoff) + denominator = _zero_out_fperr(denominator, cutoff) if not isinstance(denominator, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before @@ -1582,12 +1579,12 @@ def check_below_min_count( return False -def _zero_out_fperr(arg): +def _zero_out_fperr(arg, cutoff=1e-14): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): - return np.where(np.abs(arg) < 1e-14, 0, arg) + return np.where(np.abs(arg) < cutoff, 0, arg) else: - return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg + return arg.dtype.type(0) if np.abs(arg) < cutoff else arg @disallow("M8", "m8") From 15aa86ef9b3d15f5583be7ff5fe8af59ca44cf9a Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 26 Mar 2024 23:51:04 +0100 Subject: [PATCH 03/11] add cutoff in cython --- pandas/_libs/window/aggregations.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..1b5d8a80f63a8 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -508,7 +508,7 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - elif B <= 1e-14: + elif B <= 1e-14 and nobs > 100 or B <= 1e-16: result = NaN else: R = sqrt(B) From 79e33818b069a5cf9afba6767f222a554b0ef18a Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 26 Mar 2024 23:51:29 +0100 Subject: [PATCH 04/11] add test --- pandas/tests/test_nanops.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ed125ece349a9..fc8d53e2129ac 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1095,6 +1095,18 @@ def test_nans_skipna(self, samples, actual_kurt): kurt = nanops.nankurt(samples, skipna=True) tm.assert_almost_equal(kurt, actual_kurt) + def test_small_arrays_with_low_variance(self): + # GH-57972 + # small sample arrays with low variance have a lower threshold for breakdown + # of numerical stability and should be handled accordingly + low_var_samples = np.array( + [-2.05191341e-05] + [0.0e00] * 4 + [-4.10391103e-05] + [0.0e00] * 23 + ) + # calculated with scipy.status kurtosis(low_var_samples, bias=False) + scipy_kurt = 18.087646853025614 + kurt = nanops.nankurt(low_var_samples) + tm.assert_almost_equal(kurt, scipy_kurt) + @property def prng(self): return np.random.default_rng(2) From 2e674167b9fd882ae4b57f2393710b24fbadf329 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 26 Mar 2024 23:56:08 +0100 Subject: [PATCH 05/11] add comment and code path --- pandas/_libs/window/aggregations.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 1b5d8a80f63a8..e377c83dd7192 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -508,6 +508,7 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 + # #57972: for small arrays the cutoff can be lowered elif B <= 1e-14 and nobs > 100 or B <= 1e-16: result = NaN else: @@ -712,7 +713,8 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - if B <= 1e-14: + # #57972: for small arrays the cutoff can be lowered + if B <= 1e-14 and nobs > 100 or B <= 1e-16: result = NaN else: K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) From b030d9b29e2949b2137a8e691f4a9d2bd05c0c8d Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Wed, 27 Mar 2024 00:03:51 +0100 Subject: [PATCH 06/11] add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4b7b075ceafaf..2da831b6f858f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -324,6 +324,7 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Fixed bug in :meth:`Series.rolling.skew` and :meth:`rolling.kurt` with small sized values arrays with low variance getting zeroed out even when numerically stable (:issue:`57972`) Categorical ^^^^^^^^^^^ From 6c807cd1665a243382585687e69cb5053585db2b Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Wed, 27 Mar 2024 00:09:39 +0100 Subject: [PATCH 07/11] remove skew --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/window/aggregations.pyx | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2da831b6f858f..93759feef4b3e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -324,7 +324,7 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) -- Fixed bug in :meth:`Series.rolling.skew` and :meth:`rolling.kurt` with small sized values arrays with low variance getting zeroed out even when numerically stable (:issue:`57972`) +- Fixed bug in :meth:`Series.rolling.kurt` with small sized values arrays with low variance getting zeroed out even when numerically stable (:issue:`57972`) Categorical ^^^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index e377c83dd7192..27d2aae48579f 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -508,8 +508,7 @@ cdef float64_t calc_skew(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - # #57972: for small arrays the cutoff can be lowered - elif B <= 1e-14 and nobs > 100 or B <= 1e-16: + elif B <= 1e-14: result = NaN else: R = sqrt(B) From 7a65c8b58599620699c8d5d5fb32addfcfa37577 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Mon, 8 Apr 2024 01:05:29 +0200 Subject: [PATCH 08/11] update threshold --- pandas/_libs/window/aggregations.pyx | 2 +- pandas/core/nanops.py | 6 +++--- pandas/tests/test_nanops.py | 15 +++++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 27d2aae48579f..7311345ed5d93 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -713,7 +713,7 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 # #57972: for small arrays the cutoff can be lowered - if B <= 1e-14 and nobs > 100 or B <= 1e-16: + if B <= 1e-281: result = NaN else: K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4a16b09439ee9..df2fb294845e1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1359,9 +1359,9 @@ def nankurt( # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero (default cutoff) # GH-57972 set cutoff lower for small arrays to prevent cutoff of otherwise - # numerically stable values - length = count[0] if isinstance(count, np.ndarray) else count - cutoff = 1e-14 if length > 100 else 1e-16 + # numerically stable values. Scipy.kurtosis and this implementation start + # diverging for examples with cutoffs below e-281 + cutoff = 1e-281 numerator = _zero_out_fperr(numerator, cutoff) denominator = _zero_out_fperr(denominator, cutoff) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 537de5832ed23..f3a557eb5e5dc 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1105,16 +1105,19 @@ def test_nans_skipna(self, samples, actual_kurt): kurt = nanops.nankurt(samples, skipna=True) tm.assert_almost_equal(kurt, actual_kurt) - def test_small_arrays_with_low_variance(self): + def test_arrays_with_low_variance(self): # GH-57972 # small sample arrays with low variance have a lower threshold for breakdown # of numerical stability and should be handled accordingly - low_var_samples = np.array( - [-2.05191341e-05] + [0.0e00] * 4 + [-4.10391103e-05] + [0.0e00] * 23 - ) + n = 10_000 + n2 = 10 + # scipy.kurt is nan at e-81, + # both kurtosis start diverging from each other around e-76 + scale = 1e-72 + low_var = np.array([-2.3 * scale] * n2 + [-4.1 * scale] * n2 + [0.0] * n) # calculated with scipy.status kurtosis(low_var_samples, bias=False) - scipy_kurt = 18.087646853025614 - kurt = nanops.nankurt(low_var_samples) + scipy_kurt = 632.556235239126 + kurt = nanops.nankurt(low_var) tm.assert_almost_equal(kurt, scipy_kurt) @property From ac6858784157871533cfa5cd70bb30aef61f079f Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Mon, 8 Apr 2024 01:13:15 +0200 Subject: [PATCH 09/11] update comments --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/window/aggregations.pyx | 2 +- pandas/core/nanops.py | 2 +- pandas/tests/test_nanops.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 93759feef4b3e..f67189759cb42 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -324,7 +324,7 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) -- Fixed bug in :meth:`Series.rolling.kurt` with small sized values arrays with low variance getting zeroed out even when numerically stable (:issue:`57972`) +- Fixed bug in :meth:`Series.rolling.kurt` with low variance arrays getting zeroed out even when numerically stable (:issue:`57972`) Categorical ^^^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 7311345ed5d93..141ce2c835310 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -712,7 +712,7 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - # #57972: for small arrays the cutoff can be lowered + # #57972: for non-zero but low variance arrays the cutoff can be lowered if B <= 1e-281: result = NaN else: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index df2fb294845e1..7d6febedaa3d3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1358,7 +1358,7 @@ def nankurt( # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero (default cutoff) - # GH-57972 set cutoff lower for small arrays to prevent cutoff of otherwise + # GH-57972 set cutoff lower for low variance arrays to prevent cutoff of otherwise # numerically stable values. Scipy.kurtosis and this implementation start # diverging for examples with cutoffs below e-281 cutoff = 1e-281 diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index f3a557eb5e5dc..a74a97d351fda 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1107,7 +1107,7 @@ def test_nans_skipna(self, samples, actual_kurt): def test_arrays_with_low_variance(self): # GH-57972 - # small sample arrays with low variance have a lower threshold for breakdown + # sample arrays with low variance have a lower threshold for breakdown # of numerical stability and should be handled accordingly n = 10_000 n2 = 10 From c96fc4fced3ea8b4b118e3bd4b5a205d0583f9f4 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 9 Apr 2024 11:20:35 +0200 Subject: [PATCH 10/11] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f67189759cb42..d189f85a7e9dc 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -324,7 +324,7 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) -- Fixed bug in :meth:`Series.rolling.kurt` with low variance arrays getting zeroed out even when numerically stable (:issue:`57972`) +- Fixed bug in :meth:`Series.kurt` with low variance arrays getting zeroed out even when numerically stable (:issue:`57972`) Categorical ^^^^^^^^^^^ From af135130ac030fcf698c56e4f63e3873727551a7 Mon Sep 17 00:00:00 2001 From: Philipp Hoffmann Date: Tue, 9 Apr 2024 11:21:24 +0200 Subject: [PATCH 11/11] Revert aggregations.pyx --- pandas/_libs/window/aggregations.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 141ce2c835310..6365c030b695b 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -712,8 +712,7 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs, # if the variance is less than 1e-14, it could be # treat as zero, here we follow the original # skew/kurt behaviour to check B <= 1e-14 - # #57972: for non-zero but low variance arrays the cutoff can be lowered - if B <= 1e-281: + if B <= 1e-14: result = NaN else: K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)