
Commit 152e11f

Peter Zijlstra authored and committed
sched/fair: Implement delayed dequeue
Extend / fix 86bfbb7 ("sched/fair: Add lag based placement") by noting that lag is fundamentally a temporal measure. It should not be carried around indefinitely.

OTOH it should also not be instantly discarded: doing so would allow a task to game the system by purposefully (micro) sleeping at the end of its time quantum.

Since lag is intimately tied to the virtual time base, a wall-time based decay is also insufficient; notably, competition is required for any of this to make sense.

Instead, delay the dequeue and keep the 'tasks' on the runqueue, competing until they are eligible.

Strictly speaking, we only care about keeping them until the 0-lag point, but that is a difficult proposition; instead, carry them around until they get picked again, and dequeue them at that point.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Tested-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent e1459a5 commit 152e11f
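
To make the message's notion of "eligible" concrete: under EEVDF-style lag placement, an entity's lag is the weight-scaled gap between the queue's average virtual time V and the entity's own vruntime, and the entity is eligible while that lag is non-negative. The following is a minimal standalone sketch of the decision this patch adds to the dequeue path; struct entity, eligible() and dequeue_or_delay() are illustrative names, not kernel code.

#include <stdbool.h>
#include <stdint.h>

/* Toy model of a fair-queue entity; not the kernel's struct sched_entity. */
struct entity {
        int64_t vruntime;       /* the entity's virtual runtime, v_i */
        int     sched_delayed;  /* set while the dequeue is being delayed */
};

/*
 * lag_i = (V - v_i) * w_i; since w_i > 0 the sign only depends on V - v_i,
 * so the weight is omitted here. "Eligible" means non-negative lag.
 */
bool eligible(int64_t avg_vruntime, const struct entity *se)
{
        return se->vruntime <= avg_vruntime;
}

/* Shape of the decision added to the dequeue path. */
bool dequeue_or_delay(int64_t avg_vruntime, struct entity *se, bool sleep)
{
        if (sleep && !eligible(avg_vruntime, se)) {
                se->sched_delayed = 1;  /* stays queued, keeps competing */
                return false;           /* caller must not remove it yet */
        }
        return true;                    /* proceed with the normal dequeue */
}

In the patch itself the corresponding test is sched_feat(DELAY_DEQUEUE) && sleep && !entity_eligible(cfs_rq, se), visible in the kernel/sched/fair.c hunk below.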

3 files changed: +79 −11 lines changed

kernel/sched/deadline.c

Lines changed: 0 additions & 1 deletion
@@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_task_dl(struct rq *rq, bool peek)
                 else
                         p = dl_se->server_pick_next(dl_se);
                 if (!p) {
-                        WARN_ON_ONCE(1);
                         dl_se->dl_yielded = 1;
                         update_curr_dl_se(rq, dl_se, 0);
                         goto again;
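
The dropped WARN_ON_ONCE(1) appears to be preparation for the rest of the patch: once fair entities can linger in a delay-dequeued state, the dl-server's pick callback can legitimately come up empty, so an empty pick is simply handled (yield and retry) rather than warned about.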

kernel/sched/fair.c

Lines changed: 70 additions & 10 deletions
@@ -5379,19 +5379,38 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
+static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-        int action = UPDATE_TG;
+        update_curr(cfs_rq);
+
+        if (flags & DEQUEUE_DELAYED) {
+                SCHED_WARN_ON(!se->sched_delayed);
+        } else {
+                bool sleep = flags & DEQUEUE_SLEEP;
 
+                /*
+                 * DELAY_DEQUEUE relies on spurious wakeups, special task
+                 * states must not suffer spurious wakeups, exempt them.
+                 */
+                if (flags & DEQUEUE_SPECIAL)
+                        sleep = false;
+
+                SCHED_WARN_ON(sleep && se->sched_delayed);
+
+                if (sched_feat(DELAY_DEQUEUE) && sleep &&
+                    !entity_eligible(cfs_rq, se)) {
+                        if (cfs_rq->next == se)
+                                cfs_rq->next = NULL;
+                        se->sched_delayed = 1;
+                        return false;
+                }
+        }
+
+        int action = UPDATE_TG;
         if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
                 action |= DO_DETACH;
 
-        /*
-         * Update run-time statistics of the 'current'.
-         */
-        update_curr(cfs_rq);
-
         /*
          * When dequeuing a sched_entity, we must:
          * - Update loads to have both entity and cfs_rq synced with now.
@@ -5428,8 +5447,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                 update_min_vruntime(cfs_rq);
 
+        if (flags & DEQUEUE_DELAYED)
+                se->sched_delayed = 0;
+
         if (cfs_rq->nr_running == 0)
                 update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+        return true;
 }
 
 static void
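
Taken together, the two dequeue_entity() hunks above change the function's contract; summarized here for reference (our wording, not a comment from the patch):

/*
 * dequeue_entity() now returns a bool:
 *   true  - the entity was fully dequeued from its cfs_rq.
 *   false - the dequeue was delayed: the entity stays on the runqueue with
 *           se->sched_delayed set, and a later call with DEQUEUE_DELAYED
 *           (second hunk above) completes the removal and clears the flag.
 */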
@@ -5828,11 +5852,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
         idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+                int flags;
+
                 /* throttled entity or throttle-on-deactivate */
                 if (!se->on_rq)
                         goto done;
 
-                dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+                /*
+                 * Abuse SPECIAL to avoid delayed dequeue in this instance.
+                 * This avoids teaching dequeue_entities() about throttled
+                 * entities and keeps things relatively simple.
+                 */
+                flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+                if (se->sched_delayed)
+                        flags |= DEQUEUE_DELAYED;
+                dequeue_entity(qcfs_rq, se, flags);
 
                 if (cfs_rq_is_idle(group_cfs_rq(se)))
                         idle_task_delta = cfs_rq->h_nr_running;
@@ -6918,6 +6952,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         bool was_sched_idle = sched_idle_rq(rq);
         int rq_h_nr_running = rq->cfs.h_nr_running;
         bool task_sleep = flags & DEQUEUE_SLEEP;
+        bool task_delayed = flags & DEQUEUE_DELAYED;
         struct task_struct *p = NULL;
         int idle_h_nr_running = 0;
         int h_nr_running = 0;
@@ -6931,7 +6966,13 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
-                dequeue_entity(cfs_rq, se, flags);
+
+                if (!dequeue_entity(cfs_rq, se, flags)) {
+                        if (p && &p->se == se)
+                                return -1;
+
+                        break;
+                }
 
                 cfs_rq->h_nr_running -= h_nr_running;
                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -6956,6 +6997,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
                         break;
                 }
                 flags |= DEQUEUE_SLEEP;
+                flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
         }
 
         for_each_sched_entity(se) {
@@ -6985,6 +7027,17 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
                 rq->next_balance = jiffies;
 
+        if (p && task_delayed) {
+                SCHED_WARN_ON(!task_sleep);
+                SCHED_WARN_ON(p->on_rq != 1);
+
+                /* Fix-up what dequeue_task_fair() skipped */
+                hrtick_update(rq);
+
+                /* Fix-up what block_task() skipped. */
+                __block_task(rq, p);
+        }
+
         return 1;
 }
 
@@ -6997,8 +7050,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
         util_est_dequeue(&rq->cfs, p);
 
-        if (dequeue_entities(rq, &p->se, flags) < 0)
+        if (dequeue_entities(rq, &p->se, flags) < 0) {
+                util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
                 return false;
+        }
 
         util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
         hrtick_update(rq);
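
A summary of how the new return values above plumb together (our wording, not part of the patch):

/*
 * dequeue_entities():
 *   returns  1 - everything was dequeued as before.
 *   returns -1 - the task's own entity was delay-dequeued; the task stays
 *                on the runqueue, so dequeue_task_fair() updates util_est
 *                as if the task had slept and reports false ("not dequeued")
 *                to the core scheduler.
 * A later dequeue with DEQUEUE_DELAYED completes the removal, and the
 * "Fix-up" block performs the hrtick_update()/__block_task() work that was
 * skipped when the dequeue was first deferred.
 */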
@@ -12971,6 +13026,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
                 /* ensure bandwidth has been allocated on our new cfs_rq */
                 account_cfs_rq_runtime(cfs_rq, 0);
         }
+
+        if (!first)
+                return;
+
+        SCHED_WARN_ON(se->sched_delayed);
 }
 
 void init_cfs_rq(struct cfs_rq *cfs_rq)

kernel/sched/features.h

Lines changed: 9 additions & 0 deletions
@@ -28,6 +28,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
  */
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
+/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+
 /*
  * Allow wakeup-time preemption of the current task:
  */
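
The feature defaults to true; on kernels that expose the scheduler-features debugfs file it can typically be toggled at runtime, e.g. with echo NO_DELAY_DEQUEUE > /sys/kernel/debug/sched/features (the exact path varies across kernel versions), which is useful when isolating behaviour changes attributed to delayed dequeue.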
