Skip to content

BUG/REF: use sorted_rank_1d for rank_2d #41931

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Jun 25, 2021
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c7a91ac
REF: split out sorted_rank algo
mzeitlin11 Jun 9, 2021
4b0641e
Fixup docstring
mzeitlin11 Jun 9, 2021
b6dd4a6
WIP
mzeitlin11 Jun 9, 2021
953b188
WIP
mzeitlin11 Jun 10, 2021
254b997
premerge
mzeitlin11 Jun 10, 2021
5602dca
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
29dc590
REF: give ranks same nan filling
mzeitlin11 Jun 10, 2021
9abd9da
Merge branch 'rank_2d_dedup_chunk' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
974650d
WIP
mzeitlin11 Jun 10, 2021
b840b74
Handle empty case early
mzeitlin11 Jun 10, 2021
f099bb0
Handle empty case early
mzeitlin11 Jun 10, 2021
4aa4f8b
WIP
mzeitlin11 Jun 10, 2021
c5ed688
WIP
mzeitlin11 Jun 10, 2021
7a04159
Add object first test
mzeitlin11 Jun 10, 2021
ab9989e
Add back nogil
mzeitlin11 Jun 10, 2021
5ba6459
Add whatsnew
mzeitlin11 Jun 10, 2021
6154004
Cleaner fused type handling
mzeitlin11 Jun 10, 2021
d678bbf
Merge branch 'rank_2d_dedup_chunk' into ref/rank_2d_dedup
mzeitlin11 Jun 10, 2021
0f8744d
Add comment
mzeitlin11 Jun 10, 2021
d47f2a6
Update whatsnew
mzeitlin11 Jun 10, 2021
da61fb8
Try 32-bit fix
mzeitlin11 Jun 10, 2021
e2d9617
Debug 32-bit
mzeitlin11 Jun 10, 2021
b4d11a4
Debug 32-bit
mzeitlin11 Jun 11, 2021
9a94724
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 17, 2021
1e47dae
Move whatsnew
mzeitlin11 Jun 17, 2021
4d72d93
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 22, 2021
d7b398b
Merge remote-tracking branch 'upstream/master' into ref/rank_2d_dedup
mzeitlin11 Jun 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ Timezones

Numeric
^^^^^^^
-
- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`)
-

Conversion
Expand Down
144 changes: 55 additions & 89 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1372,26 +1372,29 @@ def rank_2d(
Fast NaN-friendly version of ``scipy.stats.rankdata``.
"""
cdef:
Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0
Py_ssize_t infs
ndarray[float64_t, ndim=2] ranks
Py_ssize_t k, n, col
float64_t[::1, :] out # Column-major so columns are contiguous
int64_t[::1, :] grp_sizes
const intp_t[:] labels
ndarray[rank_t, ndim=2] values
ndarray[intp_t, ndim=2] argsort_indexer
ndarray[uint8_t, ndim=2] mask
rank_t val, nan_fill_val
float64_t count, sum_ranks = 0.0
int tiebreak = 0
int64_t idx
bint check_mask, condition, keep_na, nans_rank_highest
rank_t[:, :] masked_vals
intp_t[:, :] sort_indexer
uint8_t[:, :] mask
TiebreakEnumType tiebreak
bint check_mask, keep_na, nans_rank_highest
rank_t nan_fill_val

tiebreak = tiebreakers[ties_method]
if tiebreak == TIEBREAK_FIRST:
if not ascending:
tiebreak = TIEBREAK_FIRST_DESCENDING

keep_na = na_option == 'keep'

# For cases where a mask is not possible, we can avoid mask checks
check_mask = not (rank_t is uint64_t or (rank_t is int64_t and not is_datetimelike))

if axis == 0:
if axis == 1:
values = np.asarray(in_arr).T.copy()
else:
values = np.asarray(in_arr).copy()
Expand All @@ -1403,99 +1406,62 @@ def rank_2d(
nans_rank_highest = ascending ^ (na_option == 'top')
if check_mask:
nan_fill_val = get_rank_nan_fill_val[rank_t](nans_rank_highest)

if rank_t is object:
mask = missing.isnaobj2d(values)
mask = missing.isnaobj2d(values).view(np.uint8)
elif rank_t is float64_t:
mask = np.isnan(values)
mask = np.isnan(values).view(np.uint8)

# int64 and datetimelike
else:
mask = values == NPY_NAT

mask = (values == NPY_NAT).view(np.uint8)
np.putmask(values, mask, nan_fill_val)
else:
mask = np.zeros_like(values, dtype=bool)
mask = np.zeros_like(values, dtype=np.uint8)

if nans_rank_highest:
order = (values, mask)
else:
order = (values, ~np.asarray(mask))

n, k = (<object>values).shape
ranks = np.empty((n, k), dtype='f8')
out = np.empty((n, k), dtype='f8', order='F')
grp_sizes = np.ones((n, k), dtype='i8', order='F')
labels = np.zeros(n, dtype=np.intp)

if tiebreak == TIEBREAK_FIRST:
# need to use a stable sort here
argsort_indexer = values.argsort(axis=1, kind='mergesort')
if not ascending:
tiebreak = TIEBREAK_FIRST_DESCENDING
# lexsort is slower, so only use if we need to worry about the mask
if check_mask:
sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
else:
argsort_indexer = values.argsort(1)
kind = "stable" if ties_method == "first" else None
sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)

if not ascending:
argsort_indexer = argsort_indexer[:, ::-1]

values = _take_2d(values, argsort_indexer)
sort_indexer = sort_indexer[::-1, :]

for i in range(n):
dups = sum_ranks = infs = 0

total_tie_count = 0
count = 0.0
for j in range(k):
val = values[i, j]
idx = argsort_indexer[i, j]
if keep_na and check_mask and mask[i, idx]:
ranks[i, idx] = NaN
infs += 1
continue

count += 1.0

sum_ranks += (j - infs) + 1
dups += 1

if rank_t is object:
condition = (
j == k - 1 or
are_diff(values[i, j + 1], val) or
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
)
else:
condition = (
j == k - 1 or
values[i, j + 1] != val or
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
)

if condition:
if tiebreak == TIEBREAK_AVERAGE:
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
elif tiebreak == TIEBREAK_MIN:
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = j - dups + 2
elif tiebreak == TIEBREAK_MAX:
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = j + 1
elif tiebreak == TIEBREAK_FIRST:
if rank_t is object:
raise ValueError('first not supported for non-numeric data')
else:
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = z + 1
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
elif tiebreak == TIEBREAK_DENSE:
total_tie_count += 1
for z in range(j - dups + 1, j + 1):
ranks[i, argsort_indexer[i, z]] = total_tie_count
sum_ranks = dups = 0
if pct:
if tiebreak == TIEBREAK_DENSE:
ranks[i, :] /= total_tie_count
else:
ranks[i, :] /= count
if axis == 0:
return ranks.T
# putmask doesn't accept a memoryview, so we assign in a separate step
masked_vals = values
with nogil:
for col in range(k):
rank_sorted_1d(
out[:, col],
grp_sizes[:, col],
labels,
sort_indexer[:, col],
masked_vals[:, col],
mask[:, col],
tiebreak,
check_mask,
False,
keep_na,
pct,
n,
)

if axis == 1:
return np.asarray(out.T)
else:
return ranks
return np.asarray(out)


ctypedef fused diff_t:
Expand Down
30 changes: 0 additions & 30 deletions pandas/_libs/algos_take_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -219,33 +219,3 @@ def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
{{endif}}

{{endfor}}

# ----------------------------------------------------------------------
# take_2d internal function
# ----------------------------------------------------------------------

ctypedef fused take_t:
float64_t
uint64_t
int64_t
object


cdef _take_2d(ndarray[take_t, ndim=2] values, ndarray[intp_t, ndim=2] idx):
cdef:
Py_ssize_t i, j, N, K
ndarray[intp_t, ndim=2, cast=True] indexer = idx
ndarray[take_t, ndim=2] result

N, K = (<object>values).shape

if take_t is object:
# evaluated at compile-time
result = values.copy()
else:
result = np.empty_like(values)

for i in range(N):
for j in range(K):
result[i, j] = values[i, indexer[i, j]]
return result
42 changes: 33 additions & 9 deletions pandas/tests/frame/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,23 +246,18 @@ def test_rank_methods_frame(self):
expected = DataFrame(sprank, columns=cols).astype("float64")
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented
@pytest.mark.parametrize("dtype", ["O", "f8", "i8"])
@pytest.mark.filterwarnings("ignore:.*Select only valid:FutureWarning")
def test_rank_descending(self, method, dtype):

if "i" in dtype:
df = self.df.dropna()
df = self.df.dropna().astype(dtype)
else:
df = self.df.astype(dtype)

res = df.rank(ascending=False)
expected = (df.max() - df).rank()
tm.assert_frame_equal(res, expected)

if method == "first" and dtype == "O":
return

expected = (df.max() - df).rank(method=method)

if dtype != "O":
Expand All @@ -287,9 +282,6 @@ def _check2d(df, expected, method="average", axis=0):
result = df.rank(method=method, axis=axis)
tm.assert_frame_equal(result, exp_df)

disabled = {(object, "first")}
if (dtype, method) in disabled:
return
frame = df if dtype is None else df.astype(dtype)
_check2d(frame, self.results[method], method=method, axis=axis)

Expand Down Expand Up @@ -456,6 +448,38 @@ def test_rank_both_inf(self):
result = df.rank()
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("top", True, [3.0, 1.0, 2.0]),
("top", False, [2.0, 1.0, 3.0]),
("bottom", True, [2.0, 3.0, 1.0]),
("bottom", False, [1.0, 3.0, 2.0]),
],
)
def test_rank_inf_nans_na_option(
self, frame_or_series, method, na_option, ascending, expected
):
obj = frame_or_series([np.inf, np.nan, -np.inf])
result = obj.rank(method=method, na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
tm.assert_equal(result, expected)

@pytest.mark.parametrize(
"na_option,ascending,expected",
[
("bottom", True, [1.0, 2.0, 4.0, 3.0]),
("bottom", False, [1.0, 2.0, 4.0, 3.0]),
("top", True, [2.0, 3.0, 1.0, 4.0]),
("top", False, [2.0, 3.0, 1.0, 4.0]),
],
)
def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
obj = frame_or_series(["foo", "foo", None, "foo"])
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
tm.assert_equal(result, expected)

@pytest.mark.parametrize(
"data,expected",
[
Expand Down