Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

KVM Archive mirror
 help / color / mirror / Atom feed

From: Peter Zijlstra <peterz@infradead.org>
To: Tobias Huschle <huschle@linux.ibm.com>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>,
	kvm@vger.kernel.org, virtualization@lists.linux.dev,
	netdev@vger.kernel.org, mst@redhat.com, jasowang@redhat.com
Subject: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
Date: Fri, 17 Nov 2023 10:58:41 +0100	[thread overview]
Message-ID: <20231117095841.GL4779@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20231117092318.GJ8262@noisy.programming.kicks-ass.net>

On Fri, Nov 17, 2023 at 10:23:18AM +0100, Peter Zijlstra wrote:
> Now, IF this is the problem, I might have a patch that helps:
> 
>   https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf&id=119feac4fcc77001cd9bf199b25f08d232289a5c

And then I turn around and wipe the repository, invalidating that link.

The sched/eevdf branch should be re-instated (with a different SHA1), but
I'll include the patch below for reference.

---
Subject: sched/eevdf: Delay dequeue
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri Sep 15 00:48:45 CEST 2023

For tasks that have negative-lag (have received 'excess' service), delay the
dequeue and keep them in the runnable tree until they're eligible again. Or
rather, keep them until they're selected again, since finding their eligibility
crossover point is expensive.

The effect is a bit like a sleeper bonus: the tasks keep contending for service
until either they get a wakeup or they're selected again and are really
dequeued.

This means that any actual dequeue happens with positive lag (service owed),
so the task is more readily run when it is next woken.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h   |    1 
 kernel/sched/core.c     |   88 +++++++++++++++++++++++++++++++++++++++---------
 kernel/sched/fair.c     |   11 ++++++
 kernel/sched/features.h |   11 ++++++
 kernel/sched/sched.h    |    3 +
 5 files changed, 97 insertions(+), 17 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -916,6 +916,7 @@ struct task_struct {
 	unsigned			sched_reset_on_fork:1;
 	unsigned			sched_contributes_to_load:1;
 	unsigned			sched_migrated:1;
+	unsigned			sched_delayed:1;
 
 	/* Force alignment to the next boundary: */
 	unsigned			:0;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3856,12 +3856,23 @@ static int ttwu_runnable(struct task_str
 
 	rq = __task_rq_lock(p, &rf);
 	if (task_on_rq_queued(p)) {
+		update_rq_clock(rq);
+		if (unlikely(p->sched_delayed)) {
+			p->sched_delayed = 0;
+			/* mustn't run a delayed task */
+			WARN_ON_ONCE(task_on_cpu(rq, p));
+			if (sched_feat(GENTLE_DELAY)) {
+				dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+				if (p->se.vlag > 0)
+					p->se.vlag = 0;
+				enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+			}
+		}
 		if (!task_on_cpu(rq, p)) {
 			/*
 			 * When on_rq && !on_cpu the task is preempted, see if
 			 * it should preempt the task that is current now.
 			 */
-			update_rq_clock(rq);
 			wakeup_preempt(rq, p, wake_flags);
 		}
 		ttwu_do_wakeup(p);
@@ -6565,6 +6576,24 @@ pick_next_task(struct rq *rq, struct tas
 # define SM_MASK_PREEMPT	SM_PREEMPT
 #endif
 
+static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state)
+{
+	p->sched_contributes_to_load =
+		(prev_state & TASK_UNINTERRUPTIBLE) &&
+		!(prev_state & TASK_NOLOAD) &&
+		!(prev_state & TASK_FROZEN);
+
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible++;
+
+	deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+	if (p->in_iowait) {
+		atomic_inc(&rq->nr_iowait);
+		delayacct_blkio_start();
+	}
+}
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6650,6 +6679,8 @@ static void __sched notrace __schedule(u
 
 	switch_count = &prev->nivcsw;
 
+	WARN_ON_ONCE(prev->sched_delayed);
+
 	/*
 	 * We must load prev->state once (task_struct::state is volatile), such
 	 * that we form a control dependency vs deactivate_task() below.
@@ -6659,14 +6690,6 @@ static void __sched notrace __schedule(u
 		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
-			prev->sched_contributes_to_load =
-				(prev_state & TASK_UNINTERRUPTIBLE) &&
-				!(prev_state & TASK_NOLOAD) &&
-				!(prev_state & TASK_FROZEN);
-
-			if (prev->sched_contributes_to_load)
-				rq->nr_uninterruptible++;
-
 			/*
 			 * __schedule()			ttwu()
 			 *   prev_state = prev->state;    if (p->on_rq && ...)
@@ -6678,17 +6701,50 @@ static void __sched notrace __schedule(u
 			 *
 			 * After this, schedule() must not care about p->state any more.
 			 */
-			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
-			if (prev->in_iowait) {
-				atomic_inc(&rq->nr_iowait);
-				delayacct_blkio_start();
-			}
+			if (sched_feat(DELAY_DEQUEUE) &&
+			    prev->sched_class->delay_dequeue_task &&
+			    prev->sched_class->delay_dequeue_task(rq, prev))
+				prev->sched_delayed = 1;
+			else
+				deschedule_task(rq, prev, prev_state);
 		}
 		switch_count = &prev->nvcsw;
 	}
 
-	next = pick_next_task(rq, prev, &rf);
+	for (struct task_struct *tmp = prev;;) {
+		unsigned long tmp_state;
+
+		next = pick_next_task(rq, tmp, &rf);
+		if (unlikely(tmp != prev))
+			finish_task(tmp);
+
+		if (likely(!next->sched_delayed))
+			break;
+
+		next->sched_delayed = 0;
+
+		/*
+		 * A sched_delayed task must not be runnable at this point, see
+		 * ttwu_runnable().
+		 */
+		tmp_state = READ_ONCE(next->__state);
+		if (WARN_ON_ONCE(!tmp_state))
+			break;
+
+		prepare_task(next);
+		/*
+		 * Order ->on_cpu and ->on_rq, see the comments in
+		 * try_to_wake_up(). Normally this is smp_mb__after_spinlock()
+		 * above.
+		 */
+		smp_wmb();
+		deschedule_task(rq, next, tmp_state);
+		if (sched_feat(GENTLE_DELAY) && next->se.vlag > 0)
+			next->se.vlag = 0;
+
+		tmp = next;
+	}
+
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8540,6 +8540,16 @@ static struct task_struct *__pick_next_t
 	return pick_next_task_fair(rq, NULL, NULL);
 }
 
+static bool delay_dequeue_task_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	update_curr(cfs_rq);
+
+	return !entity_eligible(cfs_rq, se);
+}
+
 /*
  * Account for a descheduled task:
  */
@@ -13151,6 +13161,7 @@ DEFINE_SCHED_CLASS(fair) = {
 
 	.wakeup_preempt		= check_preempt_wakeup_fair,
 
+	.delay_dequeue_task	= delay_dequeue_task_fair,
 	.pick_next_task		= __pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task          = set_next_task_fair,
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -24,6 +24,17 @@ SCHED_FEAT(PREEMPT_SHORT, true)
  */
 SCHED_FEAT(PLACE_SLEEPER, false)
 SCHED_FEAT(GENTLE_SLEEPER, true)
+/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ *
+ * GENTLE_DELAY clips the lag on dequeue (or wakeup) to 0.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+SCHED_FEAT(GENTLE_DELAY, true)
 
 /*
  * Prefer to schedule the task we woke last (assuming it failed
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2254,6 +2254,7 @@ struct sched_class {
 
 	void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags);
 
+	bool (*delay_dequeue_task)(struct rq *rq, struct task_struct *p);
 	struct task_struct *(*pick_next_task)(struct rq *rq);
 
 	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
@@ -2307,7 +2308,7 @@ struct sched_class {
 
 static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	WARN_ON_ONCE(rq->curr != prev);
+//	WARN_ON_ONCE(rq->curr != prev);
 	prev->sched_class->put_prev_task(rq, prev);
 }

next prev parent reply	other threads:[~2023-11-17  9:58 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-16 18:58 EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement) Tobias Huschle
2023-11-17  9:23 ` Peter Zijlstra
2023-11-17  9:58   ` Peter Zijlstra [this message]
2023-11-17 12:24   ` Tobias Huschle
2023-11-17 12:37     ` Peter Zijlstra
2023-11-17 13:07       ` Abel Wu
2023-11-21 13:17         ` Tobias Huschle
2023-11-22 10:00           ` Peter Zijlstra
2023-11-27 13:56             ` Tobias Huschle
     [not found]             ` <6564a012.c80a0220.adb78.f0e4SMTPIN_ADDED_BROKEN@mx.google.com>
2023-11-28  8:55               ` Abel Wu
2023-11-29  6:31                 ` Tobias Huschle
2023-12-07  6:22                 ` Tobias Huschle
     [not found]                 ` <07513.123120701265800278@us-mta-474.us.mimecast.lan>
2023-12-07  6:48                   ` Michael S. Tsirkin
2023-12-08  9:24                     ` Tobias Huschle
2023-12-08 17:28                       ` Mike Christie
     [not found]                     ` <56082.123120804242300177@us-mta-137.us.mimecast.lan>
2023-12-08 10:31                       ` Re: " Michael S. Tsirkin
2023-12-08 11:41                         ` Tobias Huschle
     [not found]                         ` <53044.123120806415900549@us-mta-342.us.mimecast.lan>
2023-12-09 10:42                           ` Michael S. Tsirkin
2023-12-11  7:26                             ` Jason Wang
2023-12-11 16:53                               ` Michael S. Tsirkin
2023-12-12  3:00                                 ` Jason Wang
2023-12-12 16:15                                   ` Michael S. Tsirkin
2023-12-13 10:37                                     ` Tobias Huschle
     [not found]                                     ` <42870.123121305373200110@us-mta-641.us.mimecast.lan>
2023-12-13 12:00                                       ` Michael S. Tsirkin
2023-12-13 12:45                                         ` Tobias Huschle
     [not found]                                         ` <25485.123121307454100283@us-mta-18.us.mimecast.lan>
2023-12-13 14:47                                           ` Michael S. Tsirkin
2023-12-13 14:55                                           ` Michael S. Tsirkin
2023-12-14  7:14                                             ` Michael S. Tsirkin
2024-01-08 13:13                                               ` Tobias Huschle
     [not found]                                               ` <92916.124010808133201076@us-mta-622.us.mimecast.lan>
2024-01-09 23:07                                                 ` Michael S. Tsirkin
2024-01-21 18:44                                                 ` Michael S. Tsirkin
2024-01-22 11:29                                                   ` Tobias Huschle
2024-02-01  7:38                                                   ` Tobias Huschle
     [not found]                                                   ` <07974.124020102385100135@us-mta-501.us.mimecast.lan>
2024-02-01  8:08                                                     ` Michael S. Tsirkin
2024-02-01 11:47                                                       ` Tobias Huschle
     [not found]                                                       ` <89460.124020106474400877@us-mta-475.us.mimecast.lan>
2024-02-01 12:08                                                         ` Michael S. Tsirkin
2024-02-22 19:23                                                         ` Michael S. Tsirkin
2024-03-11 17:05                                                         ` Michael S. Tsirkin
2024-03-12  9:45                                                           ` Luis Machado
2024-03-14 11:46                                                             ` Tobias Huschle
     [not found]                                                             ` <73123.124031407552500165@us-mta-156.us.mimecast.lan>
2024-03-14 15:09                                                               ` Michael S. Tsirkin
2024-03-15  8:33                                                                 ` Tobias Huschle
     [not found]                                                                 ` <84704.124031504335801509@us-mta-515.us.mimecast.lan>
2024-03-15 10:31                                                                   ` Michael S. Tsirkin
2024-03-19  8:21                                                                     ` Tobias Huschle
2024-03-19  8:29                                                                       ` Michael S. Tsirkin
2024-03-19  8:59                                                                         ` Tobias Huschle
2024-04-30 10:50                                                                           ` Tobias Huschle
2024-05-01 10:51                                                                             ` Peter Zijlstra
2024-05-01 15:31                                                                               ` Michael S. Tsirkin
2024-05-02  9:16                                                                                 ` Peter Zijlstra
2024-05-02 12:23                                                                                 ` Tobias Huschle
2024-05-02 12:20                                                                               ` Tobias Huschle
2023-11-18  5:14   ` Abel Wu
2023-11-20 10:56     ` Peter Zijlstra
2023-11-20 12:06       ` Abel Wu
2023-11-18  7:33 ` Abel Wu
2023-11-18 15:29   ` Honglei Wang
2023-11-19 13:29 ` Bagas Sanjaya

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231117095841.GL4779@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=huschle@linux.ibm.com \
    --cc=jasowang@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=virtualization@lists.linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).