From c8153c99f4f1551ff6258b80168fb31ec3e9c6d3 Mon Sep 17 00:00:00 2001 From: parthiban Date: Thu, 27 Apr 2023 20:55:09 +0530 Subject: [PATCH 01/10] BUG: Fix np.inf + np.nan sum issue on groupby mean --- pandas/_libs/groupby.pyx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 406b1a0f1f807..c9729af3eff57 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -54,6 +54,8 @@ cdef int64_t NPY_NAT = util.get_nat() cdef float64_t NaN = np.NaN +cdef float64_t Infinity = np.Inf + cdef enum InterpolationEnumType: INTERPOLATION_LINEAR, INTERPOLATION_LOWER, @@ -1073,7 +1075,10 @@ def group_mean( if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] - t = sumx[lab, j] + y + if sumx[lab, j] == Infinity: + t = Infinity + else: + t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y sumx[lab, j] = t From 575e80aa5e88f1869f23dcf118fda9d281efae7e Mon Sep 17 00:00:00 2001 From: parthiban Date: Fri, 28 Apr 2023 07:26:58 +0530 Subject: [PATCH 02/10] BUG: Change variable name --- pandas/_libs/groupby.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c9729af3eff57..d639394b7ca68 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -54,7 +54,7 @@ cdef int64_t NPY_NAT = util.get_nat() cdef float64_t NaN = np.NaN -cdef float64_t Infinity = np.Inf +cdef float64_t INF = np.Inf cdef enum InterpolationEnumType: INTERPOLATION_LINEAR, @@ -1075,8 +1075,8 @@ def group_mean( if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] - if sumx[lab, j] == Infinity: - t = Infinity + if sumx[lab, j] == INF: + t = INF else: t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y From 60a0279ba52214f9ca9e6fb345e755aa4cf1e193 Mon Sep 17 00:00:00 2001 From: parthiban Date: Fri, 28 Apr 2023 07:27:41 +0530 Subject: [PATCH 03/10] TST: add test case to validate the fix 
--- pandas/tests/groupby/test_libgroupby.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 9552d67bfe992..573443a55cbd1 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -282,3 +282,20 @@ def test_cython_group_mean_not_datetimelike_but_has_NaT_values(): tm.assert_numpy_array_equal( actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64") ) + + +def test_cython_group_mean_Inf_at_begining_and_end(): + # GH 50367 + actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64") + counts = np.array([0, 0]) + data = np.array( + [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]], + dtype="float64", + ) + labels = np.array([0, 1, 0, 1, 0, 1], dtype="int64") + + group_mean(actual, counts, data, labels, is_datetimelike=False) + + tm.assert_numpy_array_equal( + actual, np.array([[np.inf, 3], [3, np.inf]], dtype="float64") + ) From 323dd11c4c9ad37949bbba17b2d83b63385c8931 Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 1 May 2023 15:02:26 +0530 Subject: [PATCH 04/10] Bug: Set Compensation to 0 when it is NaN --- pandas/_libs/groupby.pyx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d639394b7ca68..85296f38369ad 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -54,8 +54,6 @@ cdef int64_t NPY_NAT = util.get_nat() cdef float64_t NaN = np.NaN -cdef float64_t INF = np.Inf - cdef enum InterpolationEnumType: INTERPOLATION_LINEAR, INTERPOLATION_LOWER, @@ -1075,11 +1073,10 @@ def group_mean( if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] - if sumx[lab, j] == INF: - t = INF - else: - t = sumx[lab, j] + y + t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y + if compensation[lab, j] != compensation[lab, j] : + 
compensation[lab, j] = 0. sumx[lab, j] = t for i in range(ncounts): From 1fc174e9b847496a78cad785c06c66f0e2853095 Mon Sep 17 00:00:00 2001 From: parthiban Date: Mon, 1 May 2023 19:33:33 +0530 Subject: [PATCH 05/10] TST: Fix failing test --- pandas/tests/groupby/test_libgroupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 573443a55cbd1..6227ffcc414fe 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -287,7 +287,7 @@ def test_cython_group_mean_not_datetimelike_but_has_NaT_values(): def test_cython_group_mean_Inf_at_begining_and_end(): # GH 50367 actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64") - counts = np.array([0, 0]) + counts = np.array([0, 0], dtype="int64") data = np.array( [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]], dtype="float64", From dea7d19a6a0e75d74c0b8cff072b6ef847cd1b3c Mon Sep 17 00:00:00 2001 From: parthiban Date: Tue, 2 May 2023 08:02:07 +0530 Subject: [PATCH 06/10] Remove Space --- pandas/_libs/groupby.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 85296f38369ad..437fa9c2b9fc7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1075,7 +1075,7 @@ def group_mean( y = val - compensation[lab, j] t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y - if compensation[lab, j] != compensation[lab, j] : + if compensation[lab, j] != compensation[lab, j]: compensation[lab, j] = 0. 
sumx[lab, j] = t From 6f3e64ee8d07831002f3dd7c0d80d0e98de027ff Mon Sep 17 00:00:00 2001 From: parthiban Date: Wed, 3 May 2023 12:37:18 +0530 Subject: [PATCH 07/10] Add Comments --- pandas/_libs/groupby.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 437fa9c2b9fc7..c05d963037e99 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1076,6 +1076,11 @@ def group_mean( t = sumx[lab, j] + y compensation[lab, j] = t - sumx[lab, j] - y if compensation[lab, j] != compensation[lab, j]: + # GH#50367 + # if val is +/- infinity, this (t-sumx[lab, j]-y) expression + # will evaluate to Nan. So we are checking whether + # compensation[lab, j] is Nan and setting it to 0. + # we cannot use util.is_nan because of no gil compensation[lab, j] = 0. sumx[lab, j] = t From cf50206d17d0c8c243b8a5b9221daca87957b143 Mon Sep 17 00:00:00 2001 From: parthiban Date: Wed, 3 May 2023 13:46:43 +0530 Subject: [PATCH 08/10] TST: assign expected to separate variable --- pandas/tests/groupby/test_libgroupby.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 6227ffcc414fe..1d94a2e5130a6 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -296,6 +296,9 @@ def test_cython_group_mean_Inf_at_begining_and_end(): group_mean(actual, counts, data, labels, is_datetimelike=False) + expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64") + tm.assert_numpy_array_equal( - actual, np.array([[np.inf, 3], [3, np.inf]], dtype="float64") + actual, + expected, ) From ac407dd888f82c961dbd170d436a9abd748abef3 Mon Sep 17 00:00:00 2001 From: parthiban Date: Thu, 4 May 2023 08:45:38 +0530 Subject: [PATCH 09/10] Update comment --- pandas/_libs/groupby.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx 
b/pandas/_libs/groupby.pyx index c05d963037e99..0636dabc0b176 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1077,10 +1077,10 @@ def group_mean( compensation[lab, j] = t - sumx[lab, j] - y if compensation[lab, j] != compensation[lab, j]: # GH#50367 - # if val is +/- infinity, this (t-sumx[lab, j]-y) expression - # will evaluate to Nan. So we are checking whether - # compensation[lab, j] is Nan and setting it to 0. - # we cannot use util.is_nan because of no gil + # If val is +/- infinity, compensation is NaN + # which would lead to results being NaN instead + # of +/-infinity. We cannot use util.is_nan + # because of no gil compensation[lab, j] = 0. sumx[lab, j] = t From 3c02170ff6ca51e9ab3e78b37a340939248e8bf3 Mon Sep 17 00:00:00 2001 From: parthiban Date: Wed, 10 May 2023 17:48:15 +0530 Subject: [PATCH 10/10] TST: Fix issue with Linux32 dtype ValueError --- pandas/tests/groupby/test_libgroupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index 1d94a2e5130a6..d10bcf9053d1a 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -292,7 +292,7 @@ def test_cython_group_mean_Inf_at_begining_and_end(): [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]], dtype="float64", ) - labels = np.array([0, 1, 0, 1, 0, 1], dtype="int64") + labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp) group_mean(actual, counts, data, labels, is_datetimelike=False)