* [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Peter Zijlstra @ 2014-06-04  8:54 UTC
  To: rafael.j.wysocki, jacob.jun.pan
  Cc: linux-kernel, linux-pm, lenb, mingo, tglx, hpa, arjan, rui.zhang,
	luto

I'm still sitting on this patch. Jacob, you were going to make it play
nice with QoS?

---
Subject: idle, thermal, acpi: Remove home grown idle implementations
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Nov 20 14:32:37 CET 2013

People are starting to grow their own idle implementations in various
disgusting ways. Collapse the lot and use the generic idle code to
provide a proper idle cycle implementation.

This does not fully preserve existing behaviour, in that the generic
idle cycle function calls into the normal cpuidle-governed idle
routines and should thus respect things like QoS parameters.

If people want to override the idle state they should talk to the
cpuidle folks about extending the interface, and attempt to preserve
QoS guarantees instead of jumping straight to the deepest possible C
state -- Jacob Pan said he was going to do this.

This is reported to work for intel_powerclamp by Jacob Pan; the
acpi_pad driver is untested.
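
For reference, a converted driver ends up looking roughly like the
sketch below. Illustrative only -- inject_idle_thread, idle_duration
and the 50ms figure are made up, not part of this patch:

	#include <linux/cpu.h>		/* play_idle() */
	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/sched/rt.h>	/* MAX_USER_RT_PRIO */

	/* runs as a kthread pinned to a single CPU at creation time */
	static int inject_idle_thread(void *arg)
	{
		static const struct sched_param param = {
			.sched_priority = MAX_USER_RT_PRIO/2,
		};
		unsigned long idle_duration = HZ / 20;	/* ~50ms */

		/*
		 * play_idle() expects a FIFO kthread bound to one CPU;
		 * see the WARN_ON_ONCE()s in play_idle() below.
		 */
		sched_setscheduler(current, SCHED_FIFO, &param);

		while (!kthread_should_stop()) {
			/* run the generic idle loop until the timer fires */
			play_idle(idle_duration);

			/* ... non-idle part of the duty cycle ... */
		}
		return 0;
	}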

Cc: rui.zhang@intel.com
Cc: jacob.jun.pan@linux.intel.com
Cc: lenb@kernel.org
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hpa@zytor.com
Cc: arjan@linux.intel.com
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 drivers/acpi/acpi_pad.c            |   41 -----------
 drivers/thermal/intel_powerclamp.c |   38 ----------
 include/linux/cpu.h                |    2 
 include/linux/sched.h              |    3 
 kernel/sched/core.c                |    9 ++
 kernel/sched/idle.c                |  129 ++++++++++++++++++++++---------------
 kernel/time/tick-sched.c           |    2 
 7 files changed, 95 insertions(+), 129 deletions(-)

--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -40,9 +40,7 @@ static DEFINE_MUTEX(round_robin_lock);
 static unsigned long power_saving_mwait_eax;
 
 static unsigned char tsc_detected_unstable;
-static unsigned char tsc_marked_unstable;
 static unsigned char lapic_detected_unstable;
-static unsigned char lapic_marked_unstable;
 
 static void power_saving_mwait_init(void)
 {
@@ -152,10 +150,9 @@ static int power_saving_thread(void *dat
 	unsigned int tsk_index = (unsigned long)data;
 	u64 last_jiffies = 0;
 
-	sched_setscheduler(current, SCHED_RR, &param);
+	sched_setscheduler(current, SCHED_FIFO, &param);
 
 	while (!kthread_should_stop()) {
-		int cpu;
 		u64 expire_time;
 
 		try_to_freeze();
@@ -170,41 +167,7 @@ static int power_saving_thread(void *dat
 
 		expire_time = jiffies + HZ * (100 - idle_pct) / 100;
 
-		while (!need_resched()) {
-			if (tsc_detected_unstable && !tsc_marked_unstable) {
-				/* TSC could halt in idle, so notify users */
-				mark_tsc_unstable("TSC halts in idle");
-				tsc_marked_unstable = 1;
-			}
-			if (lapic_detected_unstable && !lapic_marked_unstable) {
-				int i;
-				/* LAPIC could halt in idle, so notify users */
-				for_each_online_cpu(i)
-					clockevents_notify(
-						CLOCK_EVT_NOTIFY_BROADCAST_ON,
-						&i);
-				lapic_marked_unstable = 1;
-			}
-			local_irq_disable();
-			cpu = smp_processor_id();
-			if (lapic_marked_unstable)
-				clockevents_notify(
-					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
-			stop_critical_timings();
-
-			mwait_idle_with_hints(power_saving_mwait_eax, 1);
-
-			start_critical_timings();
-			if (lapic_marked_unstable)
-				clockevents_notify(
-					CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-			local_irq_enable();
-
-			if (jiffies > expire_time) {
-				do_sleep = 1;
-				break;
-			}
-		}
+		play_idle(expire_time - jiffies);
 
 		/*
 		 * current sched_rt has threshold for rt task running time.
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -256,11 +256,6 @@ static u64 pkg_state_counter(void)
 	return count;
 }
 
-static void noop_timer(unsigned long foo)
-{
-	/* empty... just the fact that we get the interrupt wakes us up */
-}
-
 static unsigned int get_compensation(int ratio)
 {
 	unsigned int comp = 0;
@@ -365,7 +360,6 @@ static bool powerclamp_adjust_controls(u
 static int clamp_thread(void *arg)
 {
 	int cpunr = (unsigned long)arg;
-	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
 	static const struct sched_param param = {
 		.sched_priority = MAX_USER_RT_PRIO/2,
 	};
@@ -374,11 +368,9 @@ static int clamp_thread(void *arg)
 
 	set_bit(cpunr, cpu_clamping_mask);
 	set_freezable();
-	init_timer_on_stack(&wakeup_timer);
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
-	while (true == clamping && !kthread_should_stop() &&
-		cpu_online(cpunr)) {
+	while (clamping && !kthread_should_stop() && cpu_online(cpunr)) {
 		int sleeptime;
 		unsigned long target_jiffies;
 		unsigned int guard;
@@ -426,35 +418,11 @@ static int clamp_thread(void *arg)
 		if (should_skip)
 			continue;
 
-		target_jiffies = jiffies + duration_jiffies;
-		mod_timer(&wakeup_timer, target_jiffies);
 		if (unlikely(local_softirq_pending()))
 			continue;
-		/*
-		 * stop tick sched during idle time, interrupts are still
-		 * allowed. thus jiffies are updated properly.
-		 */
-		preempt_disable();
-		tick_nohz_idle_enter();
-		/* mwait until target jiffies is reached */
-		while (time_before(jiffies, target_jiffies)) {
-			unsigned long ecx = 1;
-			unsigned long eax = target_mwait;
-
-			/*
-			 * REVISIT: may call enter_idle() to notify drivers who
-			 * can save power during cpu idle. same for exit_idle()
-			 */
-			local_touch_nmi();
-			stop_critical_timings();
-			mwait_idle_with_hints(eax, ecx);
-			start_critical_timings();
-			atomic_inc(&idle_wakeup_counter);
-		}
-		tick_nohz_idle_exit();
-		preempt_enable();
+
+		play_idle(duration_jiffies);
 	}
-	del_timer_sync(&wakeup_timer);
 	clear_bit(cpunr, cpu_clamping_mask);
 
 	return 0;
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -255,6 +255,8 @@ enum cpuhp_state {
 	CPUHP_ONLINE,
 };
 
+void play_idle(unsigned long jiffies);
+
 void cpu_startup_entry(enum cpuhp_state state);
 void cpu_idle(void);
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1892,6 +1892,7 @@ extern void thread_group_cputime_adjuste
 /*
  * Per process flags
  */
+#define PF_IDLE		0x00000002	/* I am an IDLE thread */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
@@ -2204,7 +2205,7 @@ extern struct task_struct *idle_task(int
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-	return p->pid == 0;
+	return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void set_curr_task(int cpu, struct task_struct *p);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -676,10 +676,12 @@ static void wake_up_idle_cpu(int cpu)
 	if (cpu == smp_processor_id())
 		return;
 
-	if (set_nr_and_not_polling(rq->idle))
+	rcu_read_lock();
+	if (set_nr_and_not_polling(rq->curr))
 		smp_send_reschedule(cpu);
 	else
 		trace_sched_wake_polling_cpu(cpu);
+	rcu_read_unlock();
 }
 
 static bool wake_up_full_nohz_cpu(int cpu)
@@ -1605,10 +1607,12 @@ static void ttwu_queue_remote(struct tas
 	struct rq *rq = cpu_rq(cpu);
 
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-		if (!set_nr_if_polling(rq->idle))
+		rcu_read_lock();
+		if (!set_nr_if_polling(rq->curr))
 			smp_send_reschedule(cpu);
 		else
 			trace_sched_wake_polling_cpu(cpu);
+		rcu_read_unlock();
 	}
 }
 
@@ -4537,6 +4541,7 @@ void init_idle(struct task_struct *idle,
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
+	idle->flags |= PF_IDLE;
 
 	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -184,66 +184,94 @@ static void cpuidle_idle_call(void)
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-	while (1) {
-		/*
-		 * If the arch has a polling bit, we maintain an invariant:
-		 *
-		 * The polling bit is clear if we're not scheduled (i.e. if
-		 * rq->curr != rq->idle).  This means that, if rq->idle has
-		 * the polling bit set, then setting need_resched is
-		 * guaranteed to cause the cpu to reschedule.
-		 */
+	/*
+	 * If the arch has a polling bit, we maintain an invariant:
+	 *
+	 * The polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+	 * rq->idle).  This means that, if rq->idle has the polling bit set,
+	 * then setting need_resched is guaranteed to cause the cpu to
+	 * reschedule.
+	 */
 
-		__current_set_polling();
-		tick_nohz_idle_enter();
+	__current_set_polling();
+	tick_nohz_idle_enter();
 
-		while (!need_resched()) {
-			check_pgt_cache();
-			rmb();
-
-			if (cpu_is_offline(smp_processor_id()))
-				arch_cpu_idle_dead();
-
-			local_irq_disable();
-			arch_cpu_idle_enter();
-
-			/*
-			 * In poll mode we reenable interrupts and spin.
-			 *
-			 * Also if we detected in the wakeup from idle
-			 * path that the tick broadcast device expired
-			 * for us, we don't want to go deep idle as we
-			 * know that the IPI is going to arrive right
-			 * away
-			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
-				cpu_idle_poll();
-			else
-				cpuidle_idle_call();
+	while (!need_resched()) {
+		check_pgt_cache();
+		rmb();
 
-			arch_cpu_idle_exit();
-		}
+		if (cpu_is_offline(smp_processor_id()))
+			arch_cpu_idle_dead();
+
+		local_irq_disable();
+		arch_cpu_idle_enter();
 
 		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
+		 * In poll mode we reenable interrupts and spin.
 		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
+		 * Also if we detected in the wakeup from idle path that the
+		 * tick broadcast device expired for us, we don't want to go
+		 * deep idle as we know that the IPI is going to arrive right
+		 * away
 		 */
-		preempt_set_need_resched();
-		tick_nohz_idle_exit();
-		__current_clr_polling();
-		smp_mb__after_clear_bit();
-		sched_ttwu_pending();
-		schedule_preempt_disabled();
+		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			cpu_idle_poll();
+		else
+			cpuidle_idle_call();
+
+		arch_cpu_idle_exit();
 	}
+
+	/*
+	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+	 * be set, propagate it into PREEMPT_NEED_RESCHED.
+	 *
+	 * This is required because for polling idle loops we will not have had
+	 * an IPI to fold the state for us.
+	 */
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+	smp_mb__after_clear_bit();
+	sched_ttwu_pending();
+	schedule_preempt_disabled();
+}
+
+static void play_idle_timer(unsigned long foo)
+{
+	set_tsk_need_resched(current);
+}
+
+void play_idle(unsigned long duration)
+{
+	DEFINE_TIMER(wakeup_timer, play_idle_timer, 0, 0);
+
+	/*
+	 * Only FIFO tasks can disable the tick since they don't need the forced
+	 * preemption.
+	 */
+	WARN_ON_ONCE(current->policy != SCHED_FIFO);
+	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+	rcu_sleep_check();
+
+	init_timer_on_stack(&wakeup_timer);
+	mod_timer_pinned(&wakeup_timer, jiffies + duration);
+
+	preempt_disable();
+	current->flags |= PF_IDLE;
+	do_idle();
+	current->flags &= ~PF_IDLE;
+	del_timer_sync(&wakeup_timer);
+	preempt_fold_need_resched();
+	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(play_idle);
 
-void cpu_startup_entry(enum cpuhp_state state)
+__noreturn void cpu_startup_entry(enum cpuhp_state state)
 {
 	/*
 	 * This #ifdef needs to die, but it's too late in the cycle to
@@ -261,5 +289,6 @@ void cpu_startup_entry(enum cpuhp_state
 	boot_init_stack_canary();
 #endif
 	arch_cpu_idle_prepare();
-	cpu_idle_loop();
+	while (1)
+		do_idle();
 }
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -807,7 +807,6 @@ void tick_nohz_idle_enter(void)
 
 	local_irq_enable();
 }
-EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
 
 /**
  * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -934,7 +933,6 @@ void tick_nohz_idle_exit(void)
 
 	local_irq_enable();
 }
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
 
 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 {


* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Jacob Pan @ 2014-06-04  8:58 UTC
  To: Peter Zijlstra
  Cc: rafael.j.wysocki, linux-kernel, linux-pm, lenb, mingo, tglx, hpa,
	arjan, rui.zhang, luto

On Wed, 4 Jun 2014 10:54:18 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> 
> I'm still sitting on this patch. Jacob, you were going to make it play
> nice with QoS?
> 
I had a patchset that works through system PM QoS and still maintains
the idle injection efficiency. When I saw you did not merge the patch
below, I thought you had abandoned it :)

The only issue, as per our last discussion, is the lack of notification
when PM QoS cannot be met. But that is intrinsic to PM QoS itself.

I also consulted with Arjan and looked at intercepting directly in
intel_idle, since both intel_powerclamp and intel_idle are arch-specific
drivers. But I think that is hard to do on a per-idle-period basis,
since we should still allow "natural" idle during the forced idle time.

So I think we can take a two-step approach:
1. Integrate your patch with an updated version of
   https://lkml.org/lkml/2013/11/26/534 such that there is no
   performance/efficiency regression.
2. Add a notification mechanism to system QoS for when constraints
   cannot be met (the request side of such a constraint is sketched
   below).
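
For the record, the request side of a PM QoS constraint -- which the
generic idle path, and hence play_idle(), honours via the cpuidle
governors -- looks roughly like this; idle_inject_qos and the two
function names are made up for illustration:

	#include <linux/pm_qos.h>

	static struct pm_qos_request idle_inject_qos;

	static void idle_inject_qos_init(void)
	{
		/* tolerate at most 100us of wakeup latency */
		pm_qos_add_request(&idle_inject_qos,
				   PM_QOS_CPU_DMA_LATENCY, 100);
	}

	static void idle_inject_qos_exit(void)
	{
		pm_qos_remove_request(&idle_inject_qos);
	}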


Thanks,

Jacob


* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Rafael J. Wysocki @ 2014-06-04 21:34 UTC
  To: Jacob Pan
  Cc: Peter Zijlstra, rafael.j.wysocki, linux-kernel, linux-pm, lenb,
	mingo, tglx, hpa, arjan, rui.zhang, luto

On Wednesday, June 04, 2014 01:58:12 AM Jacob Pan wrote:
> On Wed, 4 Jun 2014 10:54:18 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > 
> > I'm still sitting on this patch. Jacob, you were going to make it play
> > nice with QoS?
> > 
> I had a patchset that works through system PM QoS and still maintains
> the idle injection efficiency. When I saw you did not merge the patch
> below, I thought you had abandoned it :)
> 
> The only issue, as per our last discussion, is the lack of notification
> when PM QoS cannot be met. But that is intrinsic to PM QoS itself.
> 
> I also consulted with Arjan and looked at intercepting directly in
> intel_idle, since both intel_powerclamp and intel_idle are arch-specific
> drivers. But I think that is hard to do on a per-idle-period basis,
> since we should still allow "natural" idle during the forced idle time.
> 
> So I think we can take a two-step approach:
> 1. Integrate your patch with an updated version of
>    https://lkml.org/lkml/2013/11/26/534 such that there is no
>    performance/efficiency regression.
> 2. Add a notification mechanism to system QoS for when constraints
>    cannot be met.

And then there's a question of how the notification would be supposed to
work.  So I guess we can proceed with 1. and leave 2. for some time in
the future.

Rafael



* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Jacob Pan @ 2014-06-04 22:59 UTC
  To: Rafael J. Wysocki
  Cc: Peter Zijlstra, rafael.j.wysocki, linux-kernel, linux-pm, lenb,
	mingo, tglx, hpa, arjan, rui.zhang, luto

On Wed, 04 Jun 2014 23:34:51 +0200
"Rafael J. Wysocki" <rjw@rjwysocki.net> wrote:

> On Wednesday, June 04, 2014 01:58:12 AM Jacob Pan wrote:
> > On Wed, 4 Jun 2014 10:54:18 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > 
> > > I'm still sitting on this patch. Jacob, you were going to make it
> > > play nice with QoS?
> > > 
> > I had a patchset that works through system PM QoS and still maintains
> > the idle injection efficiency. When I saw you did not merge the
> > patch below, I thought you had abandoned it :)
> > 
> > The only issue, as per our last discussion, is the lack of
> > notification when PM QoS cannot be met. But that is intrinsic to PM
> > QoS itself.
> > 
> > I also consulted with Arjan and looked at intercepting directly in
> > intel_idle, since both intel_powerclamp and intel_idle are
> > arch-specific drivers. But I think that is hard to do on a
> > per-idle-period basis, since we should still allow "natural" idle
> > during the forced idle time.
> > 
> > So I think we can take a two-step approach:
> > 1. Integrate your patch with an updated version of
> >    https://lkml.org/lkml/2013/11/26/534 such that there is no
> >    performance/efficiency regression.
> > 2. Add a notification mechanism to system QoS for when constraints
> >    cannot be met.
> 
> And then there's a question of how the notification would be supposed
> to work. So I guess we can proceed with 1. and leave 2. for some time
> in the future.
Sounds good. Let me test and integrate Peter's patch with the PM QoS
change, powerclamp, and acpi_pad, then come up with a patchset.

Thanks,

Jacob


* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Peter Zijlstra @ 2014-06-05  6:55 UTC
  To: Jacob Pan
  Cc: rafael.j.wysocki, linux-kernel, linux-pm, lenb, mingo, tglx, hpa,
	arjan, rui.zhang, luto

On Wed, Jun 04, 2014 at 01:58:12AM -0700, Jacob Pan wrote:
> On Wed, 4 Jun 2014 10:54:18 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > 
> > I'm still sitting on this patch. Jacob, you were going to make it play
> > nice with QoS?
> > 
> I had a patchset that works through system PM QoS and still maintains
> the idle injection efficiency. When I saw you did not merge the patch
> below, I thought you had abandoned it :)

I was waiting for you to do the QoS bits :-)

> The only issue, as per our last discussion, is the lack of notification
> when PM QoS cannot be met. But that is intrinsic to PM QoS itself.
> 
> I also consulted with Arjan and looked at intercepting directly in
> intel_idle, since both intel_powerclamp and intel_idle are arch-specific
> drivers. But I think that is hard to do on a per-idle-period basis,
> since we should still allow "natural" idle during the forced idle time.
> 
> So I think we can take a two-step approach:
> 1. Integrate your patch with an updated version of
>    https://lkml.org/lkml/2013/11/26/534 such that there is no
>    performance/efficiency regression.
> 2. Add a notification mechanism to system QoS for when constraints
>    cannot be met.

That's fine with me; can you respin those bits?


* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Peter Zijlstra @ 2014-06-05  7:12 UTC
  To: Jacob Pan
  Cc: Rafael J. Wysocki, rafael.j.wysocki, linux-kernel, linux-pm, lenb,
	mingo, tglx, hpa, arjan, rui.zhang, luto

On Wed, Jun 04, 2014 at 03:59:20PM -0700, Jacob Pan wrote:
> Sounds good. Let me test and integrate Peter's patch with the PM QoS
> change, powerclamp, and acpi_pad, then come up with a patchset.

Slight change since yesterday; it applies on top of:

  git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/core

Note that that tree isn't stable; it's generated from my quilt series and
completely regenerated every time I update.

Ideally those things would soon make their way into tip, but Ingo's
busy with real life stuff for a bit.

---
Subject: idle, thermal, acpi: Remove home grown idle implementations
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed Nov 20 14:32:37 CET 2013

People are starting to grow their own idle implementations in various
disgusting ways. Collapse the lot and use the generic idle code to
provide a proper idle cycle implementation.

This does not fully preserve existing behaviour, in that the generic
idle cycle function calls into the normal cpuidle-governed idle
routines and should thus respect things like QoS parameters.

If people want to override the idle state they should talk to the
cpuidle folks about extending the interface, and attempt to preserve
QoS guarantees instead of jumping straight to the deepest possible C
state -- Jacob Pan said he was going to do this.

This is reported to work for intel_powerclamp by Jacob Pan; the
acpi_pad driver is untested.

Cc: rui.zhang@intel.com
Cc: jacob.jun.pan@linux.intel.com
Cc: lenb@kernel.org
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hpa@zytor.com
Cc: arjan@linux.intel.com
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
---
 drivers/acpi/acpi_pad.c            |   41 ----------
 drivers/thermal/intel_powerclamp.c |   38 ---------
 include/linux/cpu.h                |    2 
 include/linux/sched.h              |    3 
 kernel/sched/core.c                |    1 
 kernel/sched/idle.c                |  143 ++++++++++++++++++++++---------------
 kernel/time/tick-sched.c           |    2 
 kernel/trace/trace_irqsoff.c       |    2 
 8 files changed, 97 insertions(+), 135 deletions(-)

--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -40,9 +40,7 @@ static DEFINE_MUTEX(round_robin_lock);
 static unsigned long power_saving_mwait_eax;
 
 static unsigned char tsc_detected_unstable;
-static unsigned char tsc_marked_unstable;
 static unsigned char lapic_detected_unstable;
-static unsigned char lapic_marked_unstable;
 
 static void power_saving_mwait_init(void)
 {
@@ -152,10 +150,9 @@ static int power_saving_thread(void *dat
 	unsigned int tsk_index = (unsigned long)data;
 	u64 last_jiffies = 0;
 
-	sched_setscheduler(current, SCHED_RR, &param);
+	sched_setscheduler(current, SCHED_FIFO, &param);
 
 	while (!kthread_should_stop()) {
-		int cpu;
 		u64 expire_time;
 
 		try_to_freeze();
@@ -170,41 +167,7 @@ static int power_saving_thread(void *dat
 
 		expire_time = jiffies + HZ * (100 - idle_pct) / 100;
 
-		while (!need_resched()) {
-			if (tsc_detected_unstable && !tsc_marked_unstable) {
-				/* TSC could halt in idle, so notify users */
-				mark_tsc_unstable("TSC halts in idle");
-				tsc_marked_unstable = 1;
-			}
-			if (lapic_detected_unstable && !lapic_marked_unstable) {
-				int i;
-				/* LAPIC could halt in idle, so notify users */
-				for_each_online_cpu(i)
-					clockevents_notify(
-						CLOCK_EVT_NOTIFY_BROADCAST_ON,
-						&i);
-				lapic_marked_unstable = 1;
-			}
-			local_irq_disable();
-			cpu = smp_processor_id();
-			if (lapic_marked_unstable)
-				clockevents_notify(
-					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
-			stop_critical_timings();
-
-			mwait_idle_with_hints(power_saving_mwait_eax, 1);
-
-			start_critical_timings();
-			if (lapic_marked_unstable)
-				clockevents_notify(
-					CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
-			local_irq_enable();
-
-			if (jiffies > expire_time) {
-				do_sleep = 1;
-				break;
-			}
-		}
+		play_idle(expire_time - jiffies);
 
 		/*
 		 * current sched_rt has threshold for rt task running time.
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -256,11 +256,6 @@ static u64 pkg_state_counter(void)
 	return count;
 }
 
-static void noop_timer(unsigned long foo)
-{
-	/* empty... just the fact that we get the interrupt wakes us up */
-}
-
 static unsigned int get_compensation(int ratio)
 {
 	unsigned int comp = 0;
@@ -365,7 +360,6 @@ static bool powerclamp_adjust_controls(u
 static int clamp_thread(void *arg)
 {
 	int cpunr = (unsigned long)arg;
-	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
 	static const struct sched_param param = {
 		.sched_priority = MAX_USER_RT_PRIO/2,
 	};
@@ -374,11 +368,9 @@ static int clamp_thread(void *arg)
 
 	set_bit(cpunr, cpu_clamping_mask);
 	set_freezable();
-	init_timer_on_stack(&wakeup_timer);
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
-	while (true == clamping && !kthread_should_stop() &&
-		cpu_online(cpunr)) {
+	while (clamping && !kthread_should_stop() && cpu_online(cpunr)) {
 		int sleeptime;
 		unsigned long target_jiffies;
 		unsigned int guard;
@@ -426,35 +418,11 @@ static int clamp_thread(void *arg)
 		if (should_skip)
 			continue;
 
-		target_jiffies = jiffies + duration_jiffies;
-		mod_timer(&wakeup_timer, target_jiffies);
 		if (unlikely(local_softirq_pending()))
 			continue;
-		/*
-		 * stop tick sched during idle time, interrupts are still
-		 * allowed. thus jiffies are updated properly.
-		 */
-		preempt_disable();
-		tick_nohz_idle_enter();
-		/* mwait until target jiffies is reached */
-		while (time_before(jiffies, target_jiffies)) {
-			unsigned long ecx = 1;
-			unsigned long eax = target_mwait;
-
-			/*
-			 * REVISIT: may call enter_idle() to notify drivers who
-			 * can save power during cpu idle. same for exit_idle()
-			 */
-			local_touch_nmi();
-			stop_critical_timings();
-			mwait_idle_with_hints(eax, ecx);
-			start_critical_timings();
-			atomic_inc(&idle_wakeup_counter);
-		}
-		tick_nohz_idle_exit();
-		preempt_enable();
+
+		play_idle(duration_jiffies);
 	}
-	del_timer_sync(&wakeup_timer);
 	clear_bit(cpunr, cpu_clamping_mask);
 
 	return 0;
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -255,6 +255,8 @@ enum cpuhp_state {
 	CPUHP_ONLINE,
 };
 
+void play_idle(unsigned long jiffies);
+
 void cpu_startup_entry(enum cpuhp_state state);
 void cpu_idle(void);
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1892,6 +1892,7 @@ extern void thread_group_cputime_adjuste
 /*
  * Per process flags
  */
+#define PF_IDLE		0x00000002	/* I am an IDLE thread */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
@@ -2204,7 +2205,7 @@ extern struct task_struct *idle_task(int
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-	return p->pid == 0;
+	return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void set_curr_task(int cpu, struct task_struct *p);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4537,6 +4537,7 @@ void init_idle(struct task_struct *idle,
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
+	idle->flags |= PF_IDLE;
 
 	do_set_cpus_allowed(idle, cpumask_of(cpu));
 	/*
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -184,72 +184,102 @@ static void cpuidle_idle_call(void)
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-	while (1) {
-		/*
-		 * If the arch has a polling bit, we maintain an invariant:
-		 *
-		 * Our polling bit is clear if we're not scheduled (i.e. if
-		 * rq->curr != rq->idle).  This means that, if rq->idle has
-		 * the polling bit set, then setting need_resched is
-		 * guaranteed to cause the cpu to reschedule.
-		 */
+	/*
+	 * If the arch has a polling bit, we maintain an invariant:
+	 *
+	 * Our polling bit is clear if we're not scheduled (i.e. if
+	 * rq->curr != rq->idle).  This means that, if rq->idle has
+	 * the polling bit set, then setting need_resched is
+	 * guaranteed to cause the cpu to reschedule.
+	 */
 
-		__current_set_polling();
-		tick_nohz_idle_enter();
+	__current_set_polling();
+	tick_nohz_idle_enter();
 
-		while (!need_resched()) {
-			check_pgt_cache();
-			rmb();
-
-			if (cpu_is_offline(smp_processor_id()))
-				arch_cpu_idle_dead();
-
-			local_irq_disable();
-			arch_cpu_idle_enter();
-
-			/*
-			 * In poll mode we reenable interrupts and spin.
-			 *
-			 * Also if we detected in the wakeup from idle
-			 * path that the tick broadcast device expired
-			 * for us, we don't want to go deep idle as we
-			 * know that the IPI is going to arrive right
-			 * away
-			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
-				cpu_idle_poll();
-			else
-				cpuidle_idle_call();
+	while (!need_resched()) {
+		check_pgt_cache();
+		rmb();
 
-			arch_cpu_idle_exit();
-		}
+		if (cpu_is_offline(smp_processor_id()))
+			arch_cpu_idle_dead();
 
-		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
-		 */
-		preempt_set_need_resched();
-		tick_nohz_idle_exit();
-		__current_clr_polling();
+		local_irq_disable();
+		arch_cpu_idle_enter();
 
 		/*
-		 * We promise to call sched_ttwu_pending and reschedule
-		 * if need_resched is set while polling is set.  That
-		 * means that clearing polling needs to be visible
-		 * before doing these things.
+		 * In poll mode we reenable interrupts and spin.
+		 *
+		 * Also if we detected in the wakeup from idle
+		 * path that the tick broadcast device expired
+		 * for us, we don't want to go deep idle as we
+		 * know that the IPI is going to arrive right
+		 * away
 		 */
-		smp_mb__after_atomic();
+		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			cpu_idle_poll();
+		else
+			cpuidle_idle_call();
 
-		sched_ttwu_pending();
-		schedule_preempt_disabled();
+		arch_cpu_idle_exit();
 	}
+
+	/*
+	 * Since we fell out of the loop above, we know
+	 * TIF_NEED_RESCHED must be set, propagate it into
+	 * PREEMPT_NEED_RESCHED.
+	 *
+	 * This is required because for polling idle loops we will
+	 * not have had an IPI to fold the state for us.
+	 */
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+
+	/*
+	 * We promise to call sched_ttwu_pending and reschedule
+	 * if need_resched is set while polling is set.  That
+	 * means that clearing polling needs to be visible
+	 * before doing these things.
+	 */
+	smp_mb__after_atomic();
+
+	sched_ttwu_pending();
+	schedule_preempt_disabled();
+}
+
+static void play_idle_timer(unsigned long foo)
+{
+	set_tsk_need_resched(current);
+}
+
+void play_idle(unsigned long duration)
+{
+	DEFINE_TIMER(wakeup_timer, play_idle_timer, 0, 0);
+
+	/*
+	 * Only FIFO tasks can disable the tick since they don't need the forced
+	 * preemption.
+	 */
+	WARN_ON_ONCE(current->policy != SCHED_FIFO);
+	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+	rcu_sleep_check();
+
+	init_timer_on_stack(&wakeup_timer);
+	mod_timer_pinned(&wakeup_timer, jiffies + duration);
+
+	preempt_disable();
+	current->flags |= PF_IDLE;
+	do_idle();
+	current->flags &= ~PF_IDLE;
+	del_timer_sync(&wakeup_timer);
+	preempt_fold_need_resched();
+	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(play_idle);
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
@@ -269,5 +299,6 @@ void cpu_startup_entry(enum cpuhp_state
 	boot_init_stack_canary();
 #endif
 	arch_cpu_idle_prepare();
-	cpu_idle_loop();
+	while (1)
+		do_idle();
 }
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -807,7 +807,6 @@ void tick_nohz_idle_enter(void)
 
 	local_irq_enable();
 }
-EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
 
 /**
  * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -934,7 +933,6 @@ void tick_nohz_idle_exit(void)
 
 	local_irq_enable();
 }
-EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
 
 static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
 {


* Re: [PATCH] idle, thermal, acpi: Remove home grown idle implementations
From: Jacob Pan @ 2014-06-06 15:47 UTC
  To: Peter Zijlstra
  Cc: rafael.j.wysocki, linux-kernel, linux-pm, lenb, mingo, tglx, hpa,
	arjan, rui.zhang, luto

On Thu, 5 Jun 2014 08:55:20 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Jun 04, 2014 at 01:58:12AM -0700, Jacob Pan wrote:
> > On Wed, 4 Jun 2014 10:54:18 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > 
> > > I'm still sitting on this patch. Jacob, you were going to make it
> > > play nice with QoS?
> > > 
> > I had a patchset that works through system PM QoS and still maintains
> > the idle injection efficiency. When I saw you did not merge the
> > patch below, I thought you had abandoned it :)
> 
> I was waiting for you to do the QoS bits :-)
> 
> > The only issue, as per our last discussion, is the lack of
> > notification when PM QoS cannot be met. But that is intrinsic to PM
> > QoS itself.
> > 
> > I also consulted with Arjan and looked at intercepting directly in
> > intel_idle, since both intel_powerclamp and intel_idle are
> > arch-specific drivers. But I think that is hard to do on a
> > per-idle-period basis, since we should still allow "natural" idle
> > during the forced idle time.
> > 
> > So I think we can take a two-step approach:
> > 1. Integrate your patch with an updated version of
> >    https://lkml.org/lkml/2013/11/26/534 such that there is no
> >    performance/efficiency regression.
> > 2. Add a notification mechanism to system QoS for when constraints
> >    cannot be met.
> 
> That's fine with me; can you respin those bits?

Yes, working on it. It may take some time since lots of testing is
needed. Will include acpi_pad as well.

