All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@kernel.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: [GIT PULL] scheduler fix
Date: Sat, 17 Nov 2018 11:57:57 +0100	[thread overview]
Message-ID: <20181117105757.GA40115@gmail.com> (raw)

Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: c469933e772132aad040bd6a2adc8edf9ad6f825 sched/fair: Fix cpu_util_wake() for 'execl' type workloads

Fix an exec()-related scalability/performance regression, which was
caused by incorrectly calculating load and migrating tasks on exec() when
they shouldn't have been migrated.

 Thanks,

	Ingo

------------------>
Patrick Bellasi (1):
      sched/fair: Fix cpu_util_wake() for 'execl' type workloads


 kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3648d0300fdf..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-			spare_cap = capacity_spare_wake(i, p);
+			spare_cap = capacity_spare_without(i, p);
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-	 * last_update_time.
+	 * We need task's util for capacity_spare_without, sync it up to
+	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
 		sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU whose utilization is requested
+ * @p: the task whose utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	cfs_rq = &cpu_rq(cpu)->cfs;
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
-	/* Discount task's blocked util from CPU's util */
+	/* Discount task's util from CPU's util */
 	util -= min_t(unsigned int, util, task_util(p));
 
 	/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * a) if *p is the only task sleeping on this CPU, then:
 	 *      cpu_util (== task_util) > util_est (== 0)
 	 *    and thus we return:
-	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *      cpu_util_without = (cpu_util - task_util) = 0
 	 *
 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 	 *    IDLE, then:
 	 *      cpu_util >= task_util
 	 *      cpu_util > util_est (== 0)
 	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *      cpu_util_without = (cpu_util - task_util) >= 0
 	 *
 	 * c) if other tasks are RUNNABLE on that CPU and
 	 *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * covered by the following code when estimated utilization is
 	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+	if (sched_feat(UTIL_EST)) {
+		unsigned int estimated =
+			READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task()                   \
+		 *       dequeue_task()                     + RaceTime
+		 *         util_est_dequeue()              /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" is required to
+		 * properly fix the execl regression, and it further reduces
+		 * the chances of the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p)) {
+			estimated -= min_t(unsigned int, estimated,
+					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+		}
+		util = max(util, estimated);
+	}
 
 	/*
 	 * Utilization (estimated) can exceed the CPU capacity, thus let's

             reply	other threads:[~2018-11-17 10:58 UTC|newest]

Thread overview: 69+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-17 10:57 Ingo Molnar [this message]
2018-11-18 20:05 ` [GIT PULL] scheduler fix pr-tracker-bot
  -- strict thread matches above, loose matches on Subject: below --
2023-10-01  8:43 Ingo Molnar
2023-10-01 17:08 ` pr-tracker-bot
2023-09-22 10:26 Ingo Molnar
2023-09-22 20:19 ` pr-tracker-bot
2021-06-24  7:06 Ingo Molnar
2021-06-24 16:34 ` pr-tracker-bot
2020-12-27  9:16 Ingo Molnar
2020-12-27 17:27 ` pr-tracker-bot
2020-03-02  7:51 Ingo Molnar
2020-03-03 23:35 ` pr-tracker-bot
2019-12-17 11:54 Ingo Molnar
2019-12-17 19:20 ` pr-tracker-bot
2019-07-14 10:19 Ingo Molnar
2019-07-14 18:45 ` pr-tracker-bot
2019-05-05 11:02 Ingo Molnar
2019-05-05 22:10 ` pr-tracker-bot
2019-04-27 14:39 Ingo Molnar
2019-04-27 18:45 ` pr-tracker-bot
2019-04-12 13:08 Ingo Molnar
2019-04-13  4:05 ` pr-tracker-bot
2018-12-31 14:58 Ingo Molnar
2018-12-31 18:05 ` pr-tracker-bot
2018-10-11  9:12 Ingo Molnar
2018-10-11 12:32 ` Greg Kroah-Hartman
2018-10-11  9:02 Ingo Molnar
2018-01-17 15:34 Ingo Molnar
2017-10-27 19:16 Ingo Molnar
2016-12-07 18:48 Ingo Molnar
2016-10-28  8:35 Ingo Molnar
2016-10-19 15:52 Ingo Molnar
2016-10-18 11:17 Ingo Molnar
2016-09-13 18:17 Ingo Molnar
2016-07-14 18:56 Ingo Molnar
2016-05-13 18:54 Ingo Molnar
2016-05-06 11:31 Ingo Molnar
2015-07-18  2:56 Ingo Molnar
2015-03-28 13:45 Ingo Molnar
2014-01-15 18:19 Ingo Molnar
2013-09-28 18:08 Ingo Molnar
2013-09-12 12:58 Ingo Molnar
2012-05-17  8:46 Ingo Molnar
2012-03-02 10:57 Ingo Molnar
2012-02-27 10:29 Ingo Molnar
2011-04-07 17:38 Ingo Molnar
2011-03-18 13:52 Ingo Molnar
2011-03-10  8:01 Ingo Molnar
2011-01-24 13:07 Ingo Molnar
2010-04-08 15:38 Ingo Molnar
2010-04-08 15:42 ` Linus Torvalds
2010-04-08 16:03   ` Andreas Schwab
2010-04-08 18:26     ` Ingo Molnar
2010-04-08 18:36       ` Linus Torvalds
2010-04-08 18:52         ` Ingo Molnar
2009-12-23 16:03 Ingo Molnar
2009-10-08 19:01 Ingo Molnar
2009-05-05  9:35 Ingo Molnar
2009-02-17 16:40 [git pull] " Ingo Molnar
2009-02-04 19:18 Ingo Molnar
2009-01-07 22:26 Ingo Molnar
2009-01-07 23:47 ` Linus Torvalds
2009-01-08  7:50   ` Peter Zijlstra
2008-12-04 19:41 Ingo Molnar
2008-04-14 15:07 Ingo Molnar
2008-01-22 10:33 Ingo Molnar
2007-10-29 20:39 [git pull] scheduler fixes Ingo Molnar
2007-10-29 23:34 ` [git pull] scheduler fix Ingo Molnar
2007-10-30 10:15   ` Guillaume Chazarain
2007-11-01  8:39     ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181117105757.GA40115@gmail.com \
    --to=mingo@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.