* [PATCH 0/9] RT: RT-Overload/Sched enhancements v4
@ 2007-10-17 18:50 Gregory Haskins
  2007-10-17 18:50 ` [PATCH 1/9] RT: push-rt Gregory Haskins
                   ` (8 more replies)
  0 siblings, 9 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

Applies to 2.6.23-rt1 + Steve's latest push_rt patch

Changes since v3:

1) Rebased to Steve's latest
2) Added a "highest_prio" feature to eliminate a race between activating a
   task and the time it takes the target RQ to actually reschedule.
3) Dropped the PI patch, because the highest_prio patch obsoletes it.
4) A few small tweaks
5) A few small fixes

Regards,
-Greg


* [PATCH 1/9] RT: push-rt
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:50 ` [PATCH 2/9] RT: Add a per-cpu rt_overload indication Gregory Haskins
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

From: Steven Rostedt <rostedt@goodmis.org>

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---

 kernel/sched.c    |  141 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched_rt.c |   44 +++++++++++++++++
 2 files changed, 178 insertions(+), 7 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3e75c62..0dabf89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,6 +304,7 @@ struct rq {
 #ifdef CONFIG_PREEMPT_RT
 	unsigned long rt_nr_running;
 	unsigned long rt_nr_uninterruptible;
+	int curr_prio;
 #endif
 
 	unsigned long switch_timestamp;
@@ -1484,6 +1485,123 @@ next_in_queue:
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 
+/* Only try this algorithm three times */
+#define RT_PUSH_MAX_TRIES 3
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
+				      struct task_struct *task,
+				      struct rq *this_rq)
+{
+	struct rq *lowest_rq = NULL;
+	int dst_cpu = -1;
+	int cpu;
+	int tries;
+
+	for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
+		/*
+		 * Scan each rq for the lowest prio.
+		 */
+		for_each_cpu_mask(cpu, *cpu_mask) {
+			struct rq *rq = &per_cpu(runqueues, cpu);
+
+			if (cpu == smp_processor_id())
+				continue;
+
+			/* We look for lowest RT prio or non-rt CPU */
+			if (rq->curr_prio >= MAX_RT_PRIO) {
+				lowest_rq = rq;
+				dst_cpu = cpu;
+				break;
+			}
+
+			/* no locking for now */
+			if (rq->curr_prio > task->prio &&
+			    (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
+				lowest_rq = rq;
+				dst_cpu = cpu;
+			}
+		}
+
+		if (!lowest_rq)
+			break;
+
+		/* if the prio of this runqueue changed, try again */
+		if (double_lock_balance(this_rq, lowest_rq)) {
+			/*
+			 * We had to unlock the run queue. In
+			 * the meantime, the task could have
+			 * migrated already or had its affinity changed.
+			 */
+			if (unlikely(task_rq(task) != this_rq ||
+				     !cpu_isset(dst_cpu, task->cpus_allowed))) {
+				spin_unlock(&lowest_rq->lock);
+				lowest_rq = NULL;
+				break;
+			}
+
+		}
+
+		/* If this rq is still suitable use it. */
+		if (lowest_rq->curr_prio > task->prio)
+			break;
+
+		/* try again */
+		spin_unlock(&lowest_rq->lock);
+		lowest_rq = NULL;
+	}
+
+	return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+	struct task_struct *next_task;
+	struct rq *lowest_rq;
+	int dst_cpu;
+	int ret = 0;
+	cpumask_t cpu_mask;
+
+	assert_spin_locked(&this_rq->lock);
+
+	next_task = rt_next_highest_task(this_rq);
+	if (!next_task)
+		return 0;
+
+	cpus_and(cpu_mask, cpu_online_map, next_task->cpus_allowed);
+
+	/* We might release this_rq lock */
+	get_task_struct(next_task);
+
+	/* find_lock_lowest_rq locks the rq if found */
+	lowest_rq = find_lock_lowest_rq(&cpu_mask, next_task, this_rq);
+	if (!lowest_rq)
+		goto out;
+
+	dst_cpu = lowest_rq->cpu;
+
+	assert_spin_locked(&lowest_rq->lock);
+
+	deactivate_task(this_rq, next_task, 0);
+	set_task_cpu(next_task, dst_cpu);
+	activate_task(lowest_rq, next_task, 0);
+
+	resched_task(lowest_rq->curr);
+
+	spin_unlock(&lowest_rq->lock);
+
+	ret = 1;
+out:
+	put_task_struct(next_task);
+
+	return ret;
+}
+
 /*
  * Pull RT tasks from other CPUs in the RT-overload
  * case. Interrupts are disabled, local rq is locked.
@@ -2202,19 +2320,28 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
+	prev_state = prev->state;
+	_finish_arch_switch(prev);
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	rq->curr_prio = current->prio;
+#endif
+	finish_lock_switch(rq, prev);
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	/*
 	 * If we pushed an RT task off the runqueue,
-	 * then kick other CPUs, they might run it:
+	 * then kick other CPUs, they might run it.
+	 * Note we may release the rq lock, and since
+	 * the lock was owned by prev, we need to release it
+	 * first via finish_lock_switch and then reacquire it.
 	 */
-	if (unlikely(rt_task(current) && rq->rt_nr_running > 1)) {
-		schedstat_inc(rq, rto_schedule);
-		smp_send_reschedule_allbutself_cpumask(current->cpus_allowed);
+	if (unlikely(rt_task(current))) {
+		spin_lock(&rq->lock);
+		/* push_rt_task will return true if it moved an RT */
+		while (push_rt_task(rq))
+			;
+		spin_unlock(&rq->lock);
 	}
 #endif
-	prev_state = prev->state;
-	_finish_arch_switch(prev);
-	finish_lock_switch(rq, prev);
 	fire_sched_in_preempt_notifiers(current);
 	trace_stop_sched_switched(current);
 	/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 369827b..8d59e62 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -96,6 +96,50 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 	return next;
 }
 
+#ifdef CONFIG_PREEMPT_RT
+static struct task_struct *rt_next_highest_task(struct rq *rq)
+{
+	struct rt_prio_array *array = &rq->rt.active;
+	struct task_struct *next;
+	struct list_head *queue;
+	int idx;
+
+	if (likely (rq->rt_nr_running < 2))
+		return NULL;
+
+	idx = sched_find_first_bit(array->bitmap);
+	if (idx >= MAX_RT_PRIO) {
+		WARN_ON(1); /* rt_nr_running is bad */
+		return NULL;
+	}
+
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+	if (unlikely(next != current))
+		return next;
+
+	if (queue->next->next != queue) {
+		/* same prio task */
+		next = list_entry(queue->next->next, struct task_struct, run_list);
+		goto out;
+	}
+
+	/* slower, but more flexible */
+	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+	if (idx >= MAX_RT_PRIO) {
+		WARN_ON(1); /* rt_nr_running was 2 and above! */
+		return NULL;
+	}
+
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+
+ out:
+	return next;
+
+}
+#endif /* CONFIG_PREEMPT_RT */
+
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);



* [PATCH 2/9] RT: Add a per-cpu rt_overload indication
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
  2007-10-17 18:50 ` [PATCH 1/9] RT: push-rt Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:50 ` [PATCH 3/9] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

The system currently evaluates all online CPUs whenever one or more of them
enters an rt_overload condition.  This suffers from scalability limitations as
the number of online CPUs increases.  So we introduce a cpumask to track
exactly which CPUs need RT balancing.
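
In outline (a condensed paraphrase of the hunks below, not literal source),
the bookkeeping becomes:

	/* inc_rt_tasks(): a second queued RT task marks this CPU overloaded */
	if (rq->rt_nr_running == 2) {
		cpu_set(rq->cpu, rto_cpus);	/* flag this CPU for RT balancing */
		smp_wmb();			/* publish the mask bit before the counter */
		atomic_inc(&rt_overload);
	}

	/* dec_rt_tasks(): back down to a single RT task, clear the flag */
	if (rq->rt_nr_running == 1) {
		atomic_dec(&rt_overload);
		cpu_clear(rq->cpu, rto_cpus);
	}

	/* balance_rt_tasks() then scans only the flagged CPUs */
	for_each_cpu_mask(cpu, rto_cpus) {
		if (cpu == this_cpu)
			continue;
		src_rq = cpu_rq(cpu);
		/* ... pull logic unchanged ... */
	}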

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Peter W. Morreale <pmorreale@novell.com>
---

 kernel/sched.c |   12 +++++++++---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0dabf89..0da8c30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -632,6 +632,7 @@ static inline struct rq *this_rq_lock(void)
 
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 static __cacheline_aligned_in_smp atomic_t rt_overload;
+static cpumask_t rto_cpus;
 #endif
 
 static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
@@ -640,8 +641,11 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
 	if (rt_task(p)) {
 		rq->rt_nr_running++;
 # ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 2)
+		if (rq->rt_nr_running == 2) {
+			cpu_set(rq->cpu, rto_cpus);
+			smp_wmb();
 			atomic_inc(&rt_overload);
+		}
 # endif
 	}
 #endif
@@ -654,8 +658,10 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
 		WARN_ON(!rq->rt_nr_running);
 		rq->rt_nr_running--;
 # ifdef CONFIG_SMP
-		if (rq->rt_nr_running == 1)
+		if (rq->rt_nr_running == 1) {
 			atomic_dec(&rt_overload);
+			cpu_clear(rq->cpu, rto_cpus);
+		}
 # endif
 	}
 #endif
@@ -1621,7 +1627,7 @@ static void balance_rt_tasks(struct rq *this_rq, int this_cpu)
 	 */
 	next = pick_next_task(this_rq, this_rq->curr);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu_mask(cpu, rto_cpus) {
 		if (cpu == this_cpu)
 			continue;
 		src_rq = cpu_rq(cpu);



* [PATCH 3/9] RT: Wrap the RQ notion of priority to make it conditional
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
  2007-10-17 18:50 ` [PATCH 1/9] RT: push-rt Gregory Haskins
  2007-10-17 18:50 ` [PATCH 2/9] RT: Add a per-cpu rt_overload indication Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:50 ` [PATCH 4/9] RT: Initialize the priority value Gregory Haskins
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

A little cleanup to avoid #ifdef proliferation later in the series

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0da8c30..131f618 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,16 @@ struct rq {
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+static inline void set_rq_prio(struct rq *rq, int prio)
+{
+	rq->curr_prio = prio;
+}
+
+#else
+#define set_rq_prio(rq, prio) do { } while(0)
+#endif
+
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 {
 	rq->curr->sched_class->check_preempt_curr(rq, p);
@@ -2328,9 +2338,9 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	_finish_arch_switch(prev);
-#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
-	rq->curr_prio = current->prio;
-#endif
+
+	set_rq_prio(rq, current->prio);
+
 	finish_lock_switch(rq, prev);
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	/*



* [PATCH 4/9] RT: Initialize the priority value
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (2 preceding siblings ...)
  2007-10-17 18:50 ` [PATCH 3/9] RT: Wrap the RQ notion of priority to make it conditional Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:50 ` [PATCH 5/9] RT: Maintain the highest RQ priority Gregory Haskins
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

We should initialize the base value of the RQ's current priority to "IDLE"
(i.e. MAX_PRIO).

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 131f618..d68f600 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7385,6 +7385,8 @@ void __init sched_init(void)
 		highest_cpu = i;
 		/* delimiter for bitsearch: */
 		__set_bit(MAX_RT_PRIO, array->bitmap);
+
+		set_rq_prio(rq, MAX_PRIO);
 	}
 
 	set_load_weight(&init_task);



* [PATCH 5/9] RT: Maintain the highest RQ priority
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (3 preceding siblings ...)
  2007-10-17 18:50 ` [PATCH 4/9] RT: Initialize the priority value Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:50 ` [PATCH 6/9] RT: Clean up some of the push-rt logic Gregory Haskins
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

This is an implementation of Steve's idea where we should update the RQ's
notion of priority to reflect the highest-priority queued task, even if that
task is not (yet) running.  This prevents us from pushing multiple tasks to
an RQ before it gets a chance to reschedule.
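
Condensed (paraphrasing the hunks below), the RQ now advertises the priority
of its best queued task and refreshes that value on every enqueue/dequeue:

	static inline void update_rq_prio(struct rq *rq)
	{
		struct rt_prio_array *array = &rq->rt.active;
		int prio = MAX_PRIO;	/* "idle" when nothing is queued */

		if (rq->nr_running)
			prio = sched_find_first_bit(array->bitmap);

		set_rq_prio(rq, prio);	/* stored in rq->highest_prio */
	}

Because enqueue_task()/dequeue_task() call this, a task pushed to a remote RQ
raises that RQ's highest_prio immediately (before the RQ has actually
rescheduled), so a second push will no longer select the same runqueue.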

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   37 ++++++++++++++++++++++++++++---------
 1 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d68f600..67034aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -304,7 +304,7 @@ struct rq {
 #ifdef CONFIG_PREEMPT_RT
 	unsigned long rt_nr_running;
 	unsigned long rt_nr_uninterruptible;
-	int curr_prio;
+	int highest_prio;
 #endif
 
 	unsigned long switch_timestamp;
@@ -368,11 +368,23 @@ static DEFINE_MUTEX(sched_hotcpu_mutex);
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 static inline void set_rq_prio(struct rq *rq, int prio)
 {
-	rq->curr_prio = prio;
+	rq->highest_prio = prio;
+}
+
+static inline void update_rq_prio(struct rq *rq)
+{
+	struct rt_prio_array *array = &rq->rt.active;
+	int prio = MAX_PRIO;
+
+	if (rq->nr_running)
+		prio = sched_find_first_bit(array->bitmap);
+
+	set_rq_prio(rq, prio);
 }
 
 #else
 #define set_rq_prio(rq, prio) do { } while(0)
+#define update_rq_prio(rq)    do { } while(0)
 #endif
 
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
@@ -1023,12 +1035,14 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
+	update_rq_prio(rq);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
+	update_rq_prio(rq);
 }
 
 /*
@@ -1525,15 +1539,15 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 				continue;
 
 			/* We look for lowest RT prio or non-rt CPU */
-			if (rq->curr_prio >= MAX_RT_PRIO) {
+			if (rq->highest_prio >= MAX_RT_PRIO) {
 				lowest_rq = rq;
 				dst_cpu = cpu;
 				break;
 			}
 
 			/* no locking for now */
-			if (rq->curr_prio > task->prio &&
-			    (!lowest_rq || rq->curr_prio < lowest_rq->curr_prio)) {
+			if (rq->highest_prio > task->prio &&
+			    (!lowest_rq || rq->highest_prio < lowest_rq->highest_prio)) {
 				lowest_rq = rq;
 				dst_cpu = cpu;
 			}
@@ -1559,7 +1573,7 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 		}
 
 		/* If this rq is still suitable use it. */
-		if (lowest_rq->curr_prio > task->prio)
+		if (lowest_rq->highest_prio > task->prio)
 			break;
 
 		/* try again */
@@ -2338,10 +2352,8 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	_finish_arch_switch(prev);
-
-	set_rq_prio(rq, current->prio);
-
 	finish_lock_switch(rq, prev);
+
 #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
 	/*
 	 * If we pushed an RT task off the runqueue,
@@ -4646,6 +4658,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	prev_resched = _need_resched();
 
 	if (on_rq) {
+		/*
+		 * Note: RQ priority gets updated in the enqueue/dequeue logic
+		 */
 		enqueue_task(rq, p, 0);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
@@ -4712,6 +4727,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		 */
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_task(rq->curr);
+
+		/*
+		 * Note: RQ priority gets updated in the enqueue/dequeue logic
+		 */
 	}
 out_unlock:
 	task_rq_unlock(rq, &flags);



* [PATCH 6/9] RT: Clean up some of the push-rt logic
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (4 preceding siblings ...)
  2007-10-17 18:50 ` [PATCH 5/9] RT: Maintain the highest RQ priority Gregory Haskins
@ 2007-10-17 18:50 ` Gregory Haskins
  2007-10-17 18:51 ` [PATCH 7/9] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:50 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

Get rid of the superfluous dst_cpu, and move the cpu_mask inside the search
function.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   18 +++++++-----------
 1 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 67034aa..d604484 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1519,20 +1519,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 #define RT_PUSH_MAX_TRIES 3
 
 /* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
-				      struct task_struct *task,
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
 				      struct rq *this_rq)
 {
 	struct rq *lowest_rq = NULL;
-	int dst_cpu = -1;
 	int cpu;
 	int tries;
+	cpumask_t cpu_mask;
+
+	cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
 
 	for (tries = 0; tries < RT_PUSH_MAX_TRIES; tries++) {
 		/*
 		 * Scan each rq for the lowest prio.
 		 */
-		for_each_cpu_mask(cpu, *cpu_mask) {
+		for_each_cpu_mask(cpu, cpu_mask) {
 			struct rq *rq = &per_cpu(runqueues, cpu);
 
 			if (cpu == smp_processor_id())
@@ -1541,7 +1542,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			/* We look for lowest RT prio or non-rt CPU */
 			if (rq->highest_prio >= MAX_RT_PRIO) {
 				lowest_rq = rq;
-				dst_cpu = cpu;
 				break;
 			}
 
@@ -1549,7 +1549,6 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			if (rq->highest_prio > task->prio &&
 			    (!lowest_rq || rq->highest_prio < lowest_rq->highest_prio)) {
 				lowest_rq = rq;
-				dst_cpu = cpu;
 			}
 		}
 
@@ -1564,7 +1563,7 @@ static struct rq *find_lock_lowest_rq(cpumask_t *cpu_mask,
 			 * migrated already or had its affinity changed.
 			 */
 			if (unlikely(task_rq(task) != this_rq ||
-				     !cpu_isset(dst_cpu, task->cpus_allowed))) {
+				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed))) {
 				spin_unlock(&lowest_rq->lock);
 				lowest_rq = NULL;
 				break;
@@ -1595,7 +1594,6 @@ static int push_rt_task(struct rq *this_rq)
 	struct rq *lowest_rq;
 	int dst_cpu;
 	int ret = 0;
-	cpumask_t cpu_mask;
 
 	assert_spin_locked(&this_rq->lock);
 
@@ -1603,13 +1601,11 @@ static int push_rt_task(struct rq *this_rq)
 	if (!next_task)
 		return 0;
 
-	cpus_and(cpu_mask, cpu_online_map, next_task->cpus_allowed);
-
 	/* We might release this_rq lock */
 	get_task_struct(next_task);
 
 	/* find_lock_lowest_rq locks the rq if found */
-	lowest_rq = find_lock_lowest_rq(&cpu_mask, next_task, this_rq);
+	lowest_rq = find_lock_lowest_rq(next_task, this_rq);
 	if (!lowest_rq)
 		goto out;
 



* [PATCH 7/9] RT: Add support for low-priority wake-up to push_rt feature
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (5 preceding siblings ...)
  2007-10-17 18:50 ` [PATCH 6/9] RT: Clean up some of the push-rt logic Gregory Haskins
@ 2007-10-17 18:51 ` Gregory Haskins
  2007-10-17 18:51 ` [PATCH 8/9] RT: Fixes for push-rt patch Gregory Haskins
  2007-10-17 18:51 ` [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing Gregory Haskins
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:51 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

There are three events that require consideration for redistributing RT
tasks:

1) When one or more higher-priority tasks preempt a lower-priority one on
   an RQ
2) When a lower-priority task is woken up on an RQ
3) When an RQ downgrades its current priority

Steve Rostedt's push_rt patch addresses (1).  It hooks in right after
a new task has been switched in.  If this was the result of an RT
preemption, or if more than one task was awoken at the same time, we
can try to push some of those other tasks away.

This patch addresses (2).  When we wake up a task, we check to see
if it would preempt the current task on the queue.  If it will not, we
attempt to find a better suited CPU (e.g. one running something lower
priority than the task being woken) and try to activate the task there.

Finally, we have (3).  In theory, we only need to run balance_rt_tasks() if
the following conditions are met:
   1) One or more CPUs are in overload, AND
   2) We are about to switch to a task that lowers our priority.

(3) will be addressed in a later patch.
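
For case (2), the wake-up path change below amounts to the following sketch
(paraphrasing the hunk; find_lock_lowest_rq() and TASK_PREEMPTS_CURR() are
the helpers already introduced earlier in the series):

	/* in try_to_wake_up(), once the target rq/cpu for p is known: */
	if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
		struct rq *lowest_rq = find_lock_lowest_rq(p, rq);

		if (lowest_rq) {
			/* wake p on a CPU running something lower priority */
			set_task_cpu(p, lowest_rq->cpu);
			spin_unlock(&rq->lock);

			rq  = lowest_rq;	/* returned locked by the helper */
			cpu = task_cpu(p);
		}
	}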

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c |   88 ++++++++++++++++++++++++++------------------------------
 1 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d604484..0ee1e21 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1628,6 +1628,13 @@ out:
 	return ret;
 }
 
+/* Push all tasks that we can to other CPUs */
+static void push_rt_tasks(struct rq *this_rq)
+{
+	while (push_rt_task(this_rq))
+		;
+}
+
 /*
  * Pull RT tasks from other CPUs in the RT-overload
  * case. Interrupts are disabled, local rq is locked.
@@ -1988,6 +1995,25 @@ out_set_cpu:
 		this_cpu = smp_processor_id();
 		cpu = task_cpu(p);
 	}
+	
+#if defined(CONFIG_PREEMPT_RT)
+	/*
+	 * If a newly woken up RT task cannot preempt the
+	 * current (RT) task (on a target runqueue) then try
+	 * to find another CPU it can preempt:
+	 */
+	if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
+		struct rq *lowest_rq = find_lock_lowest_rq(p, rq);
+		if (lowest_rq) {
+			set_task_cpu(p, lowest_rq->cpu);
+			spin_unlock(&rq->lock);
+
+			/* The new lock was already acquired in find_lock_lowest_rq() */
+			rq  = lowest_rq;
+			cpu = task_cpu(p);
+		}
+	}
+#endif /* defined(CONFIG_PREEMPT_RT) */
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -1997,51 +2023,20 @@ out_activate:
 	trace_start_sched_wakeup(p, rq);
 
 	/*
-	 * If a newly woken up RT task cannot preempt the
-	 * current (RT) task (on a target runqueue) then try
-	 * to find another CPU it can preempt:
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
 	 */
-	if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
-		struct rq *this_rq = cpu_rq(this_cpu);
-		/*
-		 * Special-case: the task on this CPU can be
-		 * preempted. In that case there's no need to
-		 * trigger reschedules on other CPUs, we can
-		 * mark the current task for reschedule.
-		 *
-		 * (Note that it's safe to access this_rq without
-		 * extra locking in this particular case, because
-		 * we are on the current CPU.)
-		 */
-		if (TASK_PREEMPTS_CURR(p, this_rq))
-			set_tsk_need_resched(this_rq->curr);
-		else
-			/*
-			 * Neither the intended target runqueue
-			 * nor the current CPU can take this task.
-			 * Trigger a reschedule on all other CPUs
-			 * nevertheless, maybe one of them can take
-			 * this task:
-			 */
-			smp_send_reschedule_allbutself_cpumask(p->cpus_allowed);
-
-		schedstat_inc(this_rq, rto_wakeup);
-	} else {
-		/*
-		 * Sync wakeups (i.e. those types of wakeups where the waker
-		 * has indicated that it will leave the CPU in short order)
-		 * don't trigger a preemption, if the woken up task will run on
-		 * this cpu. (in this case the 'I will reschedule' promise of
-		 * the waker guarantees that the freshly woken up task is going
-		 * to be considered on this CPU.)
-		 */
-		if (!sync || cpu != this_cpu)
-			check_preempt_curr(rq, p);
-		else {
-			if (TASK_PREEMPTS_CURR(p, rq))
-				set_tsk_need_resched_delayed(rq->curr);
-		}
+	if (!sync || cpu != this_cpu)
+		check_preempt_curr(rq, p);
+	else {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			set_tsk_need_resched_delayed(rq->curr);
 	}
+
 	if (rq->curr && p && rq && _need_resched())
 		trace_special_pid(p->pid, PRIO(p), PRIO(rq->curr));
 
@@ -2358,13 +2353,12 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 * the lock was owned by prev, we need to release it
 	 * first via finish_lock_switch and then reaquire it.
 	 */
-	if (unlikely(rt_task(current))) {
+	if (unlikely(rq->rt_nr_running > 1)) {
 		spin_lock(&rq->lock);
-		/* push_rt_task will return true if it moved an RT */
-		while (push_rt_task(rq))
-			;
+		push_rt_tasks(rq);
 		spin_unlock(&rq->lock);
 	}
+
 #endif
 	fire_sched_in_preempt_notifiers(current);
 	trace_stop_sched_switched(current);



* [PATCH 8/9] RT: Fixes for push-rt patch
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (6 preceding siblings ...)
  2007-10-17 18:51 ` [PATCH 7/9] RT: Add support for low-priority wake-up to push_rt feature Gregory Haskins
@ 2007-10-17 18:51 ` Gregory Haskins
  2007-10-17 18:51 ` [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing Gregory Haskins
  8 siblings, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:51 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

From: Steven Rostedt <rostedt@goodmis.org>

Steve found these errors in the original patch

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 kernel/sched.c    |    2 +-
 kernel/sched_rt.c |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 0ee1e21..8c916de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1536,7 +1536,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
 		for_each_cpu_mask(cpu, cpu_mask) {
 			struct rq *rq = &per_cpu(runqueues, cpu);
 
-			if (cpu == smp_processor_id())
+			if (cpu == this_rq->cpu)
 				continue;
 
 			/* We look for lowest RT prio or non-rt CPU */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d59e62..04959fe 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -115,7 +115,7 @@ static struct task_struct *rt_next_highest_task(struct rq *rq)
 
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next != current))
+	if (unlikely(next != rq->curr))
 		return next;
 
 	if (queue->next->next != queue) {



* [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing
  2007-10-17 18:50 [PATCH 0/9] RT: RT-Overload/Sched enhancements v4 Gregory Haskins
                   ` (7 preceding siblings ...)
  2007-10-17 18:51 ` [PATCH 8/9] RT: Fixes for push-rt patch Gregory Haskins
@ 2007-10-17 18:51 ` Gregory Haskins
  2007-10-20  2:48   ` Roel Kluin
  8 siblings, 1 reply; 13+ messages in thread
From: Gregory Haskins @ 2007-10-17 18:51 UTC
  To: Steven Rostedt, Peter Zijlstra; +Cc: RT, Ingo Molnar, LKML, Gregory Haskins

We can avoid dirtying an rq-related cacheline with a simple check, so why not.
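
(The diffstat below is empty; the refreshed patch was reposted separately, see
the follow-ups at the end of this thread.  The intended change is presumably a
guard of roughly this shape around the set_rq_prio()/update_rq_prio() path
from patches 3 and 5; this is a hypothetical sketch, not the reposted patch:)

	static inline void set_rq_prio(struct rq *rq, int prio)
	{
		/* only store (and dirty the rq cacheline) if the value changes */
		if (rq->highest_prio != prio)
			rq->highest_prio = prio;
	}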

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 0 files changed, 0 insertions(+), 0 deletions(-)




* Re: [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing
  2007-10-17 18:51 ` [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing Gregory Haskins
@ 2007-10-20  2:48   ` Roel Kluin
  2007-10-20  7:54     ` Steven Rostedt
  2007-10-20 10:44     ` Gregory Haskins
  0 siblings, 2 replies; 13+ messages in thread
From: Roel Kluin @ 2007-10-20  2:48 UTC
  To: Gregory Haskins; +Cc: Steven Rostedt, Peter Zijlstra, RT, Ingo Molnar, LKML

Gregory Haskins wrote:
> We can avoid dirtying a rq related cacheline with a simple check, so why not.
> 
> Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> ---
> 
>  0 files changed, 0 insertions(+), 0 deletions(-)

I think you wanted a patch here?


* Re: [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing
  2007-10-20  2:48   ` Roel Kluin
@ 2007-10-20  7:54     ` Steven Rostedt
  2007-10-20 10:44     ` Gregory Haskins
  1 sibling, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2007-10-20  7:54 UTC
  To: Roel Kluin; +Cc: Gregory Haskins, Peter Zijlstra, RT, Ingo Molnar, LKML


--
On Sat, 20 Oct 2007, Roel Kluin wrote:

> Gregory Haskins wrote:
> > We can avoid dirtying a rq related cacheline with a simple check, so why not.
> >
> > Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> > ---
> >
> >  0 files changed, 0 insertions(+), 0 deletions(-)
>
> I think you wanted a patch here?
>

But it is here.  Gregory is a Zen master, and this patch does exactly what
he wanted it to do.

-- Steve



* Re: [PATCH 9/9] RT: Only dirty a cacheline if the priority is actually changing
  2007-10-20  2:48   ` Roel Kluin
  2007-10-20  7:54     ` Steven Rostedt
@ 2007-10-20 10:44     ` Gregory Haskins
  1 sibling, 0 replies; 13+ messages in thread
From: Gregory Haskins @ 2007-10-20 10:44 UTC
  To: Roel Kluin; +Cc: Steven Rostedt, Peter Zijlstra, RT, Ingo Molnar, LKML


On Sat, 2007-10-20 at 04:48 +0200, Roel Kluin wrote:
> Gregory Haskins wrote:
> > We can avoid dirtying a rq related cacheline with a simple check, so why not.
> > 
> > Signed-off-by: Gregory Haskins <ghaskins@novell.com>
> > ---
> > 
> >  0 files changed, 0 insertions(+), 0 deletions(-)
> 
> I think you wanted a patch here?

Hi Roel,
  I had forgotten to refresh before mailing the patches, but I sent an
immediate followup (which unfortunately was not linked to the original
posting).  For your reference, here is the reposting:

http://article.gmane.org/gmane.linux.rt.user/1626

Sorry for the confusion!

Regards,
-Greg




