Skip to content

Commit 978ef7b

Browse files
committed
Working rank with numeric and missing
1 parent 04feeea commit 978ef7b

File tree

3 files changed

+95
-48
lines changed

3 files changed

+95
-48
lines changed

pandas/_libs/groupby.pyx

+39-20
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ cdef int64_t iNaT = get_nat()
2626
cdef double NaN = <double> np.NaN
2727
cdef double nan = NaN
2828

29+
import missing
30+
2931

3032
# TODO: aggregate multiple columns in single pass
3133
# ----------------------------------------------------------------------
@@ -136,11 +138,25 @@ def group_rank_object(ndarray[float64_t, ndim=2] out,
136138
bint pct, ascending
137139

138140
tiebreak = tiebreakers[kwargs['ties_method']]
139-
pct = kwargs['pct']
140141
ascending = kwargs['ascending']
142+
pct = kwargs['pct']
143+
keep_na = kwargs['na_option'] == 'keep'
141144
N, K = (<object> values).shape
142145

143-
_as = np.lexsort((values[:, 0], labels))
146+
vals = np.array(values[:, 0], copy=True)
147+
mask = missing.isnaobj(vals)
148+
149+
try:
150+
_as = np.lexsort((vals, labels))
151+
except TypeError:
152+
# lexsort raises TypeError when missing data and objects are mixed;
153+
# fall back to a stable (mergesort) argsort keyed on labels, mask, values
154+
order = (vals, mask, labels)
155+
_values = np.asarray(list(zip(order[0], order[1], order[2])),
156+
dtype=[('values', 'O'), ('mask', '?'),
157+
('labels', 'i8')])
158+
_as = np.argsort(_values, kind='mergesort', order=('labels',
159+
'mask', 'values'))
144160

145161
if not ascending:
146162
_as = _as[::-1]
@@ -149,24 +165,27 @@ def group_rank_object(ndarray[float64_t, ndim=2] out,
149165
dups += 1
150166
sum_ranks += i - grp_start + 1
151167

152-
if tiebreak == TIEBREAK_AVERAGE:
153-
for j in range(i - dups + 1, i + 1):
154-
out[_as[j], 0] = sum_ranks / dups
155-
elif tiebreak == TIEBREAK_MIN:
156-
for j in range(i - dups + 1, i + 1):
157-
out[_as[j], 0] = i - grp_start - dups + 2
158-
elif tiebreak == TIEBREAK_MAX:
159-
for j in range(i - dups + 1, i + 1):
160-
out[_as[j], 0] = i - grp_start + 1
161-
elif tiebreak == TIEBREAK_FIRST:
162-
for j in range(i - dups + 1, i + 1):
163-
if ascending:
164-
out[_as[j], 0] = j + 1
165-
else:
166-
out[_as[j], 0] = 2 * i - j - dups + 2
167-
elif tiebreak == TIEBREAK_DENSE:
168-
for j in range(i - dups + 1, i + 1):
169-
out[_as[j], 0] = vals_seen
168+
if keep_na and mask[_as[i]]:
169+
out[_as[i], 0] = np.nan
170+
else:
171+
if tiebreak == TIEBREAK_AVERAGE:
172+
for j in range(i - dups + 1, i + 1):
173+
out[_as[j], 0] = sum_ranks / dups
174+
elif tiebreak == TIEBREAK_MIN:
175+
for j in range(i - dups + 1, i + 1):
176+
out[_as[j], 0] = i - grp_start - dups + 2
177+
elif tiebreak == TIEBREAK_MAX:
178+
for j in range(i - dups + 1, i + 1):
179+
out[_as[j], 0] = i - grp_start + 1
180+
elif tiebreak == TIEBREAK_FIRST:
181+
for j in range(i - dups + 1, i + 1):
182+
if ascending:
183+
out[_as[j], 0] = j + 1
184+
else:
185+
out[_as[j], 0] = 2 * i - j - dups + 2
186+
elif tiebreak == TIEBREAK_DENSE:
187+
for j in range(i - dups + 1, i + 1):
188+
out[_as[j], 0] = vals_seen
170189

171190
if (i == N - 1 or (
172191
(values[_as[i], 0] != values[_as[i+1], 0]) and not

pandas/_libs/groupby_helper.pxi.in

+51-24
Original file line numberDiff line numberDiff line change
@@ -458,15 +458,35 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
458458
int tiebreak
459459
Py_ssize_t i, j, N, K
460460
int64_t val_start=0, grp_start=0, dups=0, sum_ranks=0, vals_seen=1
461+
int64_t grp_na_count=0
461462
ndarray[int64_t] _as
462-
bint pct, ascending
463+
ndarray[{{c_type}}] _values
464+
ndarray[uint8_t] mask
465+
bint pct, ascending, keep_na
463466

464467
tiebreak = tiebreakers[kwargs['ties_method']]
465468
ascending = kwargs['ascending']
466469
pct = kwargs['pct']
470+
keep_na = kwargs['na_option'] == 'keep'
467471
N, K = (<object> values).shape
468472

469-
_as = np.lexsort((values[:, 0], labels))
473+
_values = np.array(values[:, 0], copy=True)
474+
475+
mask = np.isnan(_values).astype(np.uint8)
476+
{{if name == 'int64' }}
477+
order = (_values, labels)
478+
{{else}}
479+
if ascending ^ (kwargs['na_option'] == 'top'):
480+
nan_value = np.inf
481+
order = (_values, mask, labels)
482+
else:
483+
nan_value = -np.inf
484+
order = (_values, ~mask, labels)
485+
np.putmask(_values, mask, nan_value)
486+
{{endif}}
487+
488+
_as = np.lexsort(order)
489+
470490

471491
if not ascending:
472492
_as = _as[::-1]
@@ -476,38 +496,45 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
476496
dups += 1
477497
sum_ranks += i - grp_start + 1
478498

479-
if tiebreak == TIEBREAK_AVERAGE:
480-
for j in range(i - dups + 1, i + 1):
481-
out[_as[j], 0] = sum_ranks / dups
482-
elif tiebreak == TIEBREAK_MIN:
483-
for j in range(i - dups + 1, i + 1):
484-
out[_as[j], 0] = i - grp_start - dups + 2
485-
elif tiebreak == TIEBREAK_MAX:
486-
for j in range(i - dups + 1, i + 1):
487-
out[_as[j], 0] = i - grp_start + 1
488-
elif tiebreak == TIEBREAK_FIRST:
489-
for j in range(i - dups + 1, i + 1):
490-
if ascending:
491-
out[_as[j], 0] = j + 1
492-
else:
493-
out[_as[j], 0] = 2 * i - j - dups + 2
494-
elif tiebreak == TIEBREAK_DENSE:
495-
for j in range(i - dups + 1, i + 1):
496-
out[_as[j], 0] = vals_seen
499+
if keep_na and (values[_as[i], 0] != values[_as[i], 0]):
500+
grp_na_count += 1
501+
out[_as[i], 0] = {{nan_val}}
502+
else:
503+
if tiebreak == TIEBREAK_AVERAGE:
504+
for j in range(i - dups + 1, i + 1):
505+
out[_as[j], 0] = sum_ranks / dups
506+
elif tiebreak == TIEBREAK_MIN:
507+
for j in range(i - dups + 1, i + 1):
508+
out[_as[j], 0] = i - grp_start - dups + 2
509+
elif tiebreak == TIEBREAK_MAX:
510+
for j in range(i - dups + 1, i + 1):
511+
out[_as[j], 0] = i - grp_start + 1
512+
elif tiebreak == TIEBREAK_FIRST:
513+
for j in range(i - dups + 1, i + 1):
514+
if ascending:
515+
out[_as[j], 0] = j + 1
516+
else:
517+
out[_as[j], 0] = 2 * i - j - dups + 2
518+
elif tiebreak == TIEBREAK_DENSE:
519+
for j in range(i - dups + 1, i + 1):
520+
out[_as[j], 0] = vals_seen
497521

498522
if (i == N - 1 or (
499-
(values[_as[i], 0] != values[_as[i+1], 0]) and not
500-
(isnan(values[_as[i], 0]) and
501-
isnan(values[_as[i+1], 0])
523+
(_values[_as[i]] != _values[_as[i+1]]) and not
524+
(isnan(_values[_as[i]]) and
525+
isnan(_values[_as[i+1]])
502526
))):
503527
dups = sum_ranks = 0
504528
val_start = i
505529
vals_seen += 1
506530

531+
# End of the current group: apply pct scaling (excluding kept NAs from the denominator), then reset the per-group counters
507532
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
508533
if pct:
509534
for j in range(grp_start, i + 1):
510-
out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1)
535+
out[_as[j], 0] = out[_as[j], 0] / (i - grp_start + 1
536+
- grp_na_count)
537+
grp_na_count = 0
511538
grp_start = i + 1
512539
vals_seen = 1
513540

pandas/tests/groupby/test_groupby.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -1952,10 +1952,11 @@ def test_rank_args(self, vals, ties_method, ascending, pct, exp):
19521952

19531953
@pytest.mark.parametrize("vals", [
19541954
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
1955-
['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects
1956-
[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
1957-
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
1958-
pd.Timestamp('2018-01-06'), np.nan, np.nan]])
1955+
#['bar', 'bar', np.nan, 'foo', 'bar', 'baz', np.nan, np.nan], # objects
1956+
#[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
1957+
# pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
1958+
# pd.Timestamp('2018-01-06'), np.nan, np.nan]
1959+
])
19591960
@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [
19601961
('average', True, 'keep', False, DataFrame(
19611962
[2., 2., np.nan, 5., 2., 4., np.nan, np.nan], columns=['val'])),

0 commit comments

Comments
 (0)