From 3858ca322d5d09e60c17103b0e83e961c74e51b2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 3 Jan 2021 17:10:22 -0500 Subject: [PATCH 1/7] BUG: rank_2d mixed dtype --- pandas/_libs/algos.pyx | 33 +++++-------------------- pandas/tests/frame/methods/test_rank.py | 12 +++++++++ 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 3aa4738b36dc8..96ebb4d826faf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1112,7 +1112,6 @@ def rank_2d( int tiebreak = 0 int64_t idx bint check_mask, condition, keep_na - const int64_t[:] labels tiebreak = tiebreakers[ties_method] @@ -1158,34 +1157,14 @@ def rank_2d( n, k = (values).shape ranks = np.empty((n, k), dtype='f8') - # For compatibility when calling rank_1d - labels = np.zeros(k, dtype=np.int64) - if rank_t is object: - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d( - in_arr[i], - labels=labels, - ties_method=ties_method, - ascending=ascending, - pct=pct - ) - if axis == 0: - return ranks.T - else: - return ranks + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) + _as = values.argsort(1) if not ascending: _as = _as[:, ::-1] diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 991a91275ae1d..bb22ab133ccad 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -444,3 +444,15 @@ def test_rank_both_inf(self): expected = DataFrame({"a": [1.0, 2.0, 3.0]}) result = df.rank() tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "data,expected", + [ + ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ({"a": [1, 2, "a"]}, DataFrame(index=range(3))), + ], + ) + def test_rank_mixed_axis_zero(self, data, expected): + df = DataFrame(data) + result = df.rank() + tm.assert_frame_equal(result, expected) From 5c495f5b7b51fe2e1b4337d7585be6b65438a263 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 3 Jan 2021 17:19:00 -0500 Subject: [PATCH 2/7] Add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 393866b92771b..060788cd608ea 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -216,6 +216,8 @@ Numeric - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`) - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) +- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` +- Conversion ^^^^^^^^^^ From 4c1690cf0e2d1906e7c7956708e65fc4956ce997 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 4 Jan 2021 13:14:44 -0500 Subject: [PATCH 3/7] Use argsort_indexer --- pandas/_libs/algos.pyx | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 96ebb4d826faf..9cea80336c761 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1105,7 +1105,7 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[int64_t, ndim=2] argsorted + ndarray[int64_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask rank_t val, nan_value float64_t count, sum_ranks = 0.0 @@ -1160,17 +1160,16 @@ def rank_2d( if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') + argsort_indexer = values.argsort(axis=1, kind='mergesort') if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = values.argsort(1) + argsort_indexer = values.argsort(1) if not ascending: - _as = _as[:, ::-1] + argsort_indexer = argsort_indexer[:, ::-1] - values = _take_2d(values, _as) - argsorted = _as.astype('i8') + values = _take_2d(values, argsort_indexer) for i in range(n): dups = sum_ranks = infs = 0 @@ -1179,7 +1178,7 @@ def rank_2d( count = 0.0 for j in range(k): val = values[i, j] - idx = argsorted[i, j] + idx = argsort_indexer[i, j] if keep_na and check_mask and mask[i, idx]: ranks[i, idx] = NaN infs += 1 @@ -1194,38 +1193,38 @@ def rank_2d( condition = ( j == k - 1 or are_diff(values[i, j + 1], val) or - (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) + (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) ) else: condition = ( j == k - 1 or values[i, j + 1] != val or - (keep_na and check_mask and mask[i, argsorted[i, j + 1]]) + (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) ) if condition: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups + ranks[i, argsort_indexer[i, z]] = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 + ranks[i, argsort_indexer[i, z]] = j - dups + 2 elif tiebreak == TIEBREAK_MAX: for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 + ranks[i, argsort_indexer[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: if rank_t is object: raise ValueError('first not supported for non-numeric data') else: for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 + ranks[i, argsort_indexer[i, z]] = z + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 elif tiebreak == TIEBREAK_DENSE: total_tie_count += 1 for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count + ranks[i, argsort_indexer[i, z]] = total_tie_count sum_ranks = dups = 0 if pct: if tiebreak == TIEBREAK_DENSE: From 743572b94c372826b2591b9c0b26301cfaf34abd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 4 Jan 2021 13:54:33 -0500 Subject: [PATCH 4/7] Add back astype conversion --- pandas/_libs/algos.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 9cea80336c761..a700eb97046d2 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1170,6 +1170,7 @@ def rank_2d( argsort_indexer = argsort_indexer[:, ::-1] values = _take_2d(values, argsort_indexer) + argsort_indexer = argsort_indexer.astype('i8') for i in range(n): dups = sum_ranks = infs = 0 From 0b3d5e5c5aaf3e2e19afb2f90a2601f9a606b542 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 4 Jan 2021 15:12:33 -0500 Subject: [PATCH 5/7] 32-bit fixup --- pandas/_libs/algos.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a700eb97046d2..e985e39171a6e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1160,17 +1160,16 @@ def rank_2d( if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort') + argsort_indexer = values.argsort(axis=1, kind='mergesort').astype('i8') if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - argsort_indexer = values.argsort(1) + argsort_indexer = values.argsort(1).astype('i8') if not ascending: argsort_indexer = argsort_indexer[:, ::-1] values = _take_2d(values, argsort_indexer) - argsort_indexer = argsort_indexer.astype('i8') for i in range(n): dups = sum_ranks = infs = 0 From 6edd2b3c74dfb56ad1c990220e4581c9ff20cdcb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 4 Jan 2021 16:10:51 -0500 Subject: [PATCH 6/7] Try intp --- pandas/_libs/algos.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index e985e39171a6e..76bfb001cea81 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -26,6 +26,7 @@ from numpy cimport ( int16_t, int32_t, int64_t, + intp_t, ndarray, uint8_t, uint16_t, @@ -1105,7 +1106,7 @@ def rank_2d( Py_ssize_t infs ndarray[float64_t, ndim=2] ranks ndarray[rank_t, ndim=2] values - ndarray[int64_t, ndim=2] argsort_indexer + ndarray[intp_t, ndim=2] argsort_indexer ndarray[uint8_t, ndim=2] mask rank_t val, nan_value float64_t count, sum_ranks = 0.0 @@ -1160,11 +1161,11 @@ def rank_2d( if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here - argsort_indexer = values.argsort(axis=1, kind='mergesort').astype('i8') + argsort_indexer = values.argsort(axis=1, kind='mergesort') if not ascending: tiebreak = TIEBREAK_FIRST_DESCENDING else: - argsort_indexer = values.argsort(1).astype('i8') + argsort_indexer = values.argsort(1) if not ascending: argsort_indexer = argsort_indexer[:, ::-1] From 66504dec9072fe7eeff565bb5622716988d1dd55 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 4 Jan 2021 18:28:05 -0500 Subject: [PATCH 7/7] Update doc/source/whatsnew/v1.3.0.rst --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f321b13d4fc02..0884065247fbc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -217,7 +217,7 @@ Numeric - Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`) - Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`) - Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`) -- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` +- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`) - Conversion