From 975de8433bb0941675a69c6eabbdaf8ec8632277 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 09:33:04 -0500 Subject: [PATCH 01/11] BUG: 2d rank infs and nans --- pandas/_libs/algos.pyx | 47 ++++++------- pandas/tests/frame/methods/test_rank.py | 87 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4cddd49381a83..76a96f769b2e4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1067,6 +1067,8 @@ def rank_2d( mask = values == NPY_NAT np.putmask(values, mask, nan_value) + else: + mask = np.zeros_like(values, dtype=bool) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') @@ -1097,45 +1099,36 @@ def rank_2d( values = _take_2d(values, _as) argsorted = _as.astype('i8') - for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 + dups = sum_ranks = infs = 0 total_tie_count = 0 count = 0.0 for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - val = values[i, j] - if rank_t is not uint64_t: - if rank_t is object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue + if mask[i, argsorted[i, j]] and keep_na: + ranks[i, argsorted[i, j]] = NaN + infs += 1 + continue count += 1.0 - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 + sum_ranks += (j - infs) + 1 + dups += 1 if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) + condition = ( + j == k - 1 or + are_diff(values[i, j + 1], val) or + (mask[i, argsorted[i, j + 1]] and keep_na) + ) else: - condition = j == k - 1 or values[i, j + 1] != val + condition = ( + j == k - 1 or + values[i, j + 1] != val or + (mask[i, argsorted[i, j + 1]] and keep_na) + ) if condition: if tiebreak == TIEBREAK_AVERAGE: @@ -1161,11 +1154,13 @@ def rank_2d( for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 + if pct: if tiebreak == TIEBREAK_DENSE: ranks[i, :] /= total_tie_count else: ranks[i, :] /= count + if axis == 0: return ranks.T else: diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index bab2db3192b4a..fa0db7c3610d3 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._libs import iNaT +from pandas._libs.algos import Infinity, NegInfinity import pandas.util._test_decorators as td from pandas import DataFrame, Series @@ -329,3 +331,88 @@ def test_pct_max_many_rows(self): ) result = df.rank(pct=True).max() assert (result == 1).all() + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + pytest.param( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + marks=pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ), + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ], + ) + def test_rank_inf_and_nan(self, contents, dtype): + dtype_na_map = { + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, + } + # Insert nans at random positions if underlying dtype has missing + # value. Then adjust the expected order by adding nans accordingly + # This is for testing whether rank calculation is affected + # when values are interwined with nan values. + values = np.array(contents, dtype=dtype) + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 + if dtype in dtype_na_map: + na_value = dtype_na_map[dtype] + nan_indices = np.random.choice(range(len(values)), 5) + values = np.insert(values, nan_indices, na_value) + exp_order = np.insert(exp_order, nan_indices, np.nan) + # shuffle the testing array and expected results in the same way + random_order = np.random.permutation(len(values)) + df = DataFrame({"a": values[random_order]}) + expected = DataFrame({"a": exp_order[random_order]}, dtype="float64") + result = df.rank() + tm.assert_frame_equal(result, expected) From fd358b1164aa0248ce5ca058697f0415a76ae06a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 09:44:45 -0500 Subject: [PATCH 02/11] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c3d896166fabe..d098b72ad998e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -201,6 +201,9 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`) - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`) +- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) +- +- Conversion ^^^^^^^^^^ From 256ff8441072f72ba3e368a4169d21b817a2c860 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 09:48:40 -0500 Subject: [PATCH 03/11] Remove added blank line --- pandas/_libs/algos.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 76a96f769b2e4..d87e7c3461e70 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1154,13 +1154,11 @@ def rank_2d( for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 - if pct: if tiebreak == TIEBREAK_DENSE: ranks[i, :] /= total_tie_count else: ranks[i, :] /= count - if axis == 0: return ranks.T else: From f7bc05852723ef0f1e2f6309bd81fd9980cce4e1 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 09:49:29 -0500 Subject: [PATCH 04/11] Add back other blank line --- pandas/_libs/algos.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d87e7c3461e70..10501937e47aa 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1099,6 +1099,7 @@ def rank_2d( values = _take_2d(values, _as) argsorted = _as.astype('i8') + for i in range(n): dups = sum_ranks = infs = 0 From 39df107bea5648b667c5313b5bec1c14aa7d74d3 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 09:50:22 -0500 Subject: [PATCH 05/11] Clean diff --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 10501937e47aa..b4ffea335a036 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1099,7 +1099,7 @@ def rank_2d( values = _take_2d(values, _as) argsorted = _as.astype('i8') - + for i in range(n): dups = sum_ranks = infs = 0 From 7abd460f37883dc66f82e0abd3e9708deea99462 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Thu, 24 Dec 2020 19:33:20 -0500 Subject: [PATCH 06/11] Add tests from OP --- pandas/tests/frame/methods/test_rank.py | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index fa0db7c3610d3..e781ccb4a8734 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -416,3 +416,31 @@ def test_rank_inf_and_nan(self, contents, dtype): expected = DataFrame({"a": exp_order[random_order]}, dtype="float64") result = df.rank() tm.assert_frame_equal(result, expected) + + def test_df_series_inf_nan_consistency(self): + # GH#32593 + index = [5, 4, 3, 2, 1, 6, 7, 8, 9, 10] + col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] + col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + df = DataFrame( + index=index, + data={ + "col1": col1, + "col2": col2, + }, + dtype="f8", + ) + df_result = df.rank() + + series_result = df.copy() + series_result["col1"] = df["col1"].rank() + series_result["col2"] = df["col2"].rank() + + tm.assert_frame_equal(df_result, series_result) + + def test_rank_both_inf(self): + # GH#32593 + df = DataFrame({"a": [-np.inf, 0, np.inf]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0]}) + result = df.rank() + tm.assert_frame_equal(result, expected) From b722d19e528a6c0b1dfafa3eb67858c78ec733e9 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 10:58:19 -0500 Subject: [PATCH 07/11] Avoid repeated lookup --- pandas/_libs/algos.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b4ffea335a036..d842bb91390df 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1025,9 +1025,10 @@ def rank_2d( rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 + int64_t idx bint keep_na = False float64_t count = 0.0 - bint condition, skip_condition + bint condition tiebreak = tiebreakers[ties_method] @@ -1107,9 +1108,9 @@ def rank_2d( count = 0.0 for j in range(k): val = values[i, j] - - if mask[i, argsorted[i, j]] and keep_na: - ranks[i, argsorted[i, j]] = NaN + idx = argsorted[i, j] + if mask[i, idx] and keep_na: + ranks[i, idx] = NaN infs += 1 continue From 9d6ee7f23058c812d1aeac3a35748d71e3c11fc3 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 11:01:06 -0500 Subject: [PATCH 08/11] Move index after data --- pandas/tests/frame/methods/test_rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index e781ccb4a8734..991a91275ae1d 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -423,11 +423,11 @@ def test_df_series_inf_nan_consistency(self): col1 = [5, 4, 3, 5, 8, 5, 2, 1, 6, 6] col2 = [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] df = DataFrame( - index=index, data={ "col1": col1, "col2": col2, }, + index=index, dtype="f8", ) df_result = df.rank() From 36884e673c7557afae852802859cc79c0332b567 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 11:50:17 -0500 Subject: [PATCH 09/11] Type mask, shortcircuit --- pandas/_libs/algos.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d842bb91390df..cb7224bf91f7f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1022,17 +1022,19 @@ def rank_2d( ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted + ndarray[uint8_t, ndim=2] mask rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 int64_t idx bint keep_na = False float64_t count = 0.0 - bint condition + bint condition, mask_check tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' + mask_check = rank_t is not uint64_t if axis == 0: values = np.asarray(in_arr).T.copy() @@ -1109,7 +1111,7 @@ def rank_2d( for j in range(k): val = values[i, j] idx = argsorted[i, j] - if mask[i, idx] and keep_na: + if keep_na and mask_check and mask[i, idx]: ranks[i, idx] = NaN infs += 1 continue @@ -1123,13 +1125,13 @@ def rank_2d( condition = ( j == k - 1 or are_diff(values[i, j + 1], val) or - (mask[i, argsorted[i, j + 1]] and keep_na) + (keep_na and mask_check and mask[i, argsorted[i, j + 1]]) ) else: condition = ( j == k - 1 or values[i, j + 1] != val or - (mask[i, argsorted[i, j + 1]] and keep_na) + (keep_na and mask_check and mask[i, argsorted[i, j + 1]]) ) if condition: From e60aaf0c23044b6687ab0ca6dd47b4dcec098f0b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 14:02:39 -0500 Subject: [PATCH 10/11] Remove dup whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b082b7809922f..607a14c696578 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -211,7 +211,6 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`) - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`) -- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) From 732c482ebec0e25f81262c402e63a35a58ebb390 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 14:05:28 -0500 Subject: [PATCH 11/11] Name check more clearly --- pandas/_libs/algos.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index cb7224bf91f7f..d97957eea0543 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1029,12 +1029,12 @@ def rank_2d( int64_t idx bint keep_na = False float64_t count = 0.0 - bint condition, mask_check + bint condition, check_mask tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' - mask_check = rank_t is not uint64_t + check_mask = rank_t is not uint64_t if axis == 0: values = np.asarray(in_arr).T.copy() @@ -1111,7 +1111,7 @@ def rank_2d( for j in range(k): val = values[i, j] idx = argsorted[i, j] - if keep_na and mask_check and mask[i, idx]: + if keep_na and check_mask and mask[i, idx]: ranks[i, idx] = NaN infs += 1 continue @@ -1125,13 +1125,13 @@ def rank_2d( condition = ( j == k - 1 or are_diff(values[i, j + 1], val) or - (keep_na and mask_check and mask[i, argsorted[i, j + 1]]) + (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) ) else: condition = ( j == k - 1 or values[i, j + 1] != val or - (keep_na and mask_check and mask[i, argsorted[i, j + 1]]) + (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) ) if condition: